Repository: spark Updated Branches: refs/heads/master 2460f03ff -> 180fd3e0a
http://git-wip-us.apache.org/repos/asf/spark/blob/180fd3e0/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala ---------------------------------------------------------------------- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala index 75ba33a..989d250 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala @@ -20,6 +20,7 @@ package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.Normalizer +import org.apache.spark.ml.linalg.Vectors // $example off$ import org.apache.spark.sql.SparkSession @@ -31,7 +32,11 @@ object NormalizerExample { .getOrCreate() // $example on$ - val dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + val dataFrame = spark.createDataFrame(Seq( + (0, Vectors.dense(1.0, 0.5, -1.0)), + (1, Vectors.dense(2.0, 1.0, 1.0)), + (2, Vectors.dense(4.0, 10.0, 2.0)) + )).toDF("id", "features") // Normalize each Vector using $L^1$ norm. val normalizer = new Normalizer() @@ -40,10 +45,12 @@ object NormalizerExample { .setP(1.0) val l1NormData = normalizer.transform(dataFrame) + println("Normalized using L^1 norm") l1NormData.show() // Normalize each Vector using $L^\infty$ norm. val lInfNormData = normalizer.transform(dataFrame, normalizer.p -> Double.PositiveInfinity) + println("Normalized using L^inf norm") lInfNormData.show() // $example off$ http://git-wip-us.apache.org/repos/asf/spark/blob/180fd3e0/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala ---------------------------------------------------------------------- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala index 4aa649b..274cc12 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala @@ -49,8 +49,9 @@ object OneHotEncoderExample { val encoder = new OneHotEncoder() .setInputCol("categoryIndex") .setOutputCol("categoryVec") + val encoded = encoder.transform(indexed) - encoded.select("id", "categoryVec").show() + encoded.show() // $example off$ spark.stop() http://git-wip-us.apache.org/repos/asf/spark/blob/180fd3e0/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala ---------------------------------------------------------------------- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala index acde110..4ad6c7c 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala @@ -69,7 +69,7 @@ object OneVsRestExample { // compute the classification error on test data. val accuracy = evaluator.evaluate(predictions) - println(s"Test Error : ${1 - accuracy}") + println(s"Test Error = ${1 - accuracy}") // $example off$ spark.stop() http://git-wip-us.apache.org/repos/asf/spark/blob/180fd3e0/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala ---------------------------------------------------------------------- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala index dca96ee..4e1d7cd 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala @@ -38,14 +38,15 @@ object PCAExample { Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) ) val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features") + val pca = new PCA() .setInputCol("features") .setOutputCol("pcaFeatures") .setK(3) .fit(df) - val pcaDF = pca.transform(df) - val result = pcaDF.select("pcaFeatures") - result.show() + + val result = pca.transform(df).select("pcaFeatures") + result.show(false) // $example off$ spark.stop() http://git-wip-us.apache.org/repos/asf/spark/blob/180fd3e0/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala ---------------------------------------------------------------------- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala index 54d2e6b..f117b03 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala @@ -33,17 +33,19 @@ object PolynomialExpansionExample { // $example on$ val data = Array( - Vectors.dense(-2.0, 2.3), + Vectors.dense(2.0, 1.0), Vectors.dense(0.0, 0.0), - Vectors.dense(0.6, -1.1) + Vectors.dense(3.0, -1.0) ) val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features") - val polynomialExpansion = new PolynomialExpansion() + + val polyExpansion = new PolynomialExpansion() .setInputCol("features") .setOutputCol("polyFeatures") .setDegree(3) - val polyDF = polynomialExpansion.transform(df) - polyDF.select("polyFeatures").take(3).foreach(println) + + val polyDF = polyExpansion.transform(df) + polyDF.show(false) // $example off$ spark.stop() http://git-wip-us.apache.org/repos/asf/spark/blob/180fd3e0/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala ---------------------------------------------------------------------- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala index a56de08..369a6ff 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala @@ -40,7 +40,7 @@ object StopWordsRemoverExample { (1, Seq("Mary", "had", "a", "little", "lamb")) )).toDF("id", "raw") - remover.transform(dataSet).show() + remover.transform(dataSet).show(false) // $example off$ spark.stop() http://git-wip-us.apache.org/repos/asf/spark/blob/180fd3e0/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala ---------------------------------------------------------------------- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala index 97f6fcc..ec2df2e 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala @@ -33,9 +33,9 @@ object TfIdfExample { // $example on$ val sentenceData = spark.createDataFrame(Seq( - (0, "Hi I heard about Spark"), - (0, "I wish Java could use case classes"), - (1, "Logistic regression models are neat") + (0.0, "Hi I heard about Spark"), + (0.0, "I wish Java could use case classes"), + (1.0, "Logistic regression models are neat") )).toDF("label", "sentence") val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words") @@ -51,7 +51,7 @@ object TfIdfExample { val idfModel = idf.fit(featurizedData) val rescaledData = idfModel.transform(featurizedData) - rescaledData.select("features", "label").take(3).foreach(println) + rescaledData.select("label", "features").show() // $example off$ spark.stop() http://git-wip-us.apache.org/repos/asf/spark/blob/180fd3e0/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala ---------------------------------------------------------------------- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala index 90d0faa..0167dc3 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala @@ -20,6 +20,7 @@ package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer} +import org.apache.spark.sql.functions._ // $example off$ import org.apache.spark.sql.SparkSession @@ -35,7 +36,7 @@ object TokenizerExample { (0, "Hi I heard about Spark"), (1, "I wish Java could use case classes"), (2, "Logistic,regression,models,are,neat") - )).toDF("label", "sentence") + )).toDF("id", "sentence") val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words") val regexTokenizer = new RegexTokenizer() @@ -43,11 +44,15 @@ object TokenizerExample { .setOutputCol("words") .setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false) + val countTokens = udf { (words: Seq[String]) => words.length } + val tokenized = tokenizer.transform(sentenceDataFrame) - tokenized.select("words", "label").take(3).foreach(println) + tokenized.select("sentence", "words") + .withColumn("tokens", countTokens(col("words"))).show(false) val regexTokenized = regexTokenizer.transform(sentenceDataFrame) - regexTokenized.select("words", "label").take(3).foreach(println) + regexTokenized.select("sentence", "words") + .withColumn("tokens", countTokens(col("words"))).show(false) // $example off$ spark.stop() http://git-wip-us.apache.org/repos/asf/spark/blob/180fd3e0/examples/src/main/scala/org/apache/spark/examples/ml/UnaryTransformerExample.scala ---------------------------------------------------------------------- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/UnaryTransformerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/UnaryTransformerExample.scala index 13c72f8..13b58d1 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/UnaryTransformerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/UnaryTransformerExample.scala @@ -100,6 +100,7 @@ object UnaryTransformerExample { val data = spark.range(0, 5).toDF("input") .select(col("input").cast("double").as("input")) val result = myTransformer.transform(data) + println("Transformed by adding constant value") result.show() // Save and load the Transformer. @@ -109,6 +110,7 @@ object UnaryTransformerExample { val sameTransformer = MyTransformer.load(dirName) // Transform the data to show the results are identical. + println("Same transform applied from loaded model") val sameResult = sameTransformer.transform(data) sameResult.show() http://git-wip-us.apache.org/repos/asf/spark/blob/180fd3e0/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala ---------------------------------------------------------------------- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala index 8910470..3d5c7ef 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala @@ -41,7 +41,8 @@ object VectorAssemblerExample { .setOutputCol("features") val output = assembler.transform(dataset) - println(output.select("features", "clicked").first()) + println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'") + output.select("features", "clicked").show(false) // $example off$ spark.stop() http://git-wip-us.apache.org/repos/asf/spark/blob/180fd3e0/examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala ---------------------------------------------------------------------- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala index 85dd5c2..63a6091 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala @@ -37,7 +37,10 @@ object VectorSlicerExample { .getOrCreate() // $example on$ - val data = Arrays.asList(Row(Vectors.dense(-2.0, 2.3, 0.0))) + val data = Arrays.asList( + Row(Vectors.sparse(3, Seq((0, -2.0), (1, 2.3)))), + Row(Vectors.dense(-2.0, 2.3, 0.0)) + ) val defaultAttr = NumericAttribute.defaultAttr val attrs = Array("f1", "f2", "f3").map(defaultAttr.withName) @@ -51,7 +54,7 @@ object VectorSlicerExample { // or slicer.setIndices(Array(1, 2)), or slicer.setNames(Array("f2", "f3")) val output = slicer.transform(dataset) - println(output.select("userFeatures", "features").first()) + output.show(false) // $example off$ spark.stop() http://git-wip-us.apache.org/repos/asf/spark/blob/180fd3e0/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala ---------------------------------------------------------------------- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala index 5c8bd19..4bcc6ac 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala @@ -20,6 +20,8 @@ package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.Word2Vec +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.sql.Row // $example off$ import org.apache.spark.sql.SparkSession @@ -47,7 +49,8 @@ object Word2VecExample { val model = word2Vec.fit(documentDF) val result = model.transform(documentDF) - result.select("result").take(3).foreach(println) + result.collect().foreach { case Row(text: Seq[_], features: Vector) => + println(s"Text: [${text.mkString(", ")}] => \nVector: $features\n") } // $example off$ spark.stop() --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org