[1/2] spark git commit: [SPARK-19533][EXAMPLES] Convert Java tests to use lambdas, Java 8 features
Repository: spark Updated Branches: refs/heads/master ba8912e5f -> de14d35f7 http://git-wip-us.apache.org/repos/asf/spark/blob/de14d35f/examples/src/main/java/org/apache/spark/examples/mllib/JavaRecommendationExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRecommendationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRecommendationExample.java index f69aa4b..1ee68da 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRecommendationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRecommendationExample.java @@ -21,7 +21,6 @@ package org.apache.spark.examples.mllib; import scala.Tuple2; import org.apache.spark.api.java.*; -import org.apache.spark.api.java.function.Function; import org.apache.spark.mllib.recommendation.ALS; import org.apache.spark.mllib.recommendation.MatrixFactorizationModel; import org.apache.spark.mllib.recommendation.Rating; @@ -37,15 +36,12 @@ public class JavaRecommendationExample { // Load and parse the data String path = "data/mllib/als/test.data"; JavaRDD data = jsc.textFile(path); -JavaRDD ratings = data.map( - new Function() { -public Rating call(String s) { - String[] sarray = s.split(","); - return new Rating(Integer.parseInt(sarray[0]), Integer.parseInt(sarray[1]), -Double.parseDouble(sarray[2])); -} - } -); +JavaRDD ratings = data.map(s -> { + String[] sarray = s.split(","); + return new Rating(Integer.parseInt(sarray[0]), +Integer.parseInt(sarray[1]), +Double.parseDouble(sarray[2])); +}); // Build the recommendation model using ALS int rank = 10; @@ -53,37 +49,19 @@ public class JavaRecommendationExample { MatrixFactorizationModel model = ALS.train(JavaRDD.toRDD(ratings), rank, numIterations, 0.01); // Evaluate the model on rating data -JavaRDD> userProducts = ratings.map( - new Function>() { -public Tuple2 call(Rating r) { - return new Tuple2(r.user(), r.product()); -} - } -); +JavaRDD> userProducts = + ratings.map(r -> new Tuple2<>(r.user(), r.product())); JavaPairRDD, Double> predictions = JavaPairRDD.fromJavaRDD( - model.predict(JavaRDD.toRDD(userProducts)).toJavaRDD().map( -new Function, Double>>() { - public Tuple2, Double> call(Rating r){ -return new Tuple2<>(new Tuple2<>(r.user(), r.product()), r.rating()); - } -} - )); -JavaRDD> ratesAndPreds = - JavaPairRDD.fromJavaRDD(ratings.map( -new Function, Double>>() { - public Tuple2, Double> call(Rating r){ -return new Tuple2<>(new Tuple2<>(r.user(), r.product()), r.rating()); - } -} - )).join(predictions).values(); -double MSE = JavaDoubleRDD.fromRDD(ratesAndPreds.map( - new Function, Object>() { -public Object call(Tuple2 pair) { - Double err = pair._1() - pair._2(); - return err * err; -} - } -).rdd()).mean(); + model.predict(JavaRDD.toRDD(userProducts)).toJavaRDD() + .map(r -> new Tuple2<>(new Tuple2<>(r.user(), r.product()), r.rating())) +); +JavaRDD> ratesAndPreds = JavaPairRDD.fromJavaRDD( +ratings.map(r -> new Tuple2<>(new Tuple2<>(r.user(), r.product()), r.rating( + .join(predictions).values(); +double MSE = ratesAndPreds.mapToDouble(pair -> { + double err = pair._1() - pair._2(); + return err * err; +}).mean(); System.out.println("Mean Squared Error = " + MSE); // Save and load model http://git-wip-us.apache.org/repos/asf/spark/blob/de14d35f/examples/src/main/java/org/apache/spark/examples/mllib/JavaRegressionMetricsExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRegressionMetricsExample.java 
b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRegressionMetricsExample.java index b3e5c04..7bb9993 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRegressionMetricsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRegressionMetricsExample.java @@ -21,7 +21,6 @@ package org.apache.spark.examples.mllib; import scala.Tuple2; import org.apache.spark.api.java.*; -import org.apache.spark.api.java.function.Function; import org.apache.spark.mllib.linalg.Vectors; import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.mllib.regression.LinearRegressionModel; @@ -38,34 +37,24 @@ public class JavaRegressionMetricsExample { // Load and parse the data String path = "data/mllib/sample_linear_regression_data.txt"; JavaRDD data = sc.textFile(path); -JavaRDD parsedData = data.map( - new Fun
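Editor's note: the archive formatting above strips the generic type parameters (and a few closing parentheses) from the converted example. For readability, here is a sketch of the lambda-based core of JavaRecommendationExample as it reads after this change, with the generics filled back in (assumed from the surrounding MLlib types; `data` and `model` are defined earlier in the example):

```java
import scala.Tuple2;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.recommendation.Rating;

// Parse "user,product,rating" lines into Rating objects with a lambda.
JavaRDD<Rating> ratings = data.map(s -> {
  String[] sarray = s.split(",");
  return new Rating(Integer.parseInt(sarray[0]),
      Integer.parseInt(sarray[1]),
      Double.parseDouble(sarray[2]));
});

// Evaluate the ALS model on the rating data.
JavaRDD<Tuple2<Object, Object>> userProducts =
    ratings.map(r -> new Tuple2<>(r.user(), r.product()));

JavaPairRDD<Tuple2<Integer, Integer>, Double> predictions = JavaPairRDD.fromJavaRDD(
    model.predict(JavaRDD.toRDD(userProducts)).toJavaRDD()
        .map(r -> new Tuple2<>(new Tuple2<>(r.user(), r.product()), r.rating())));

JavaRDD<Tuple2<Double, Double>> ratesAndPreds = JavaPairRDD.fromJavaRDD(
    ratings.map(r -> new Tuple2<>(new Tuple2<>(r.user(), r.product()), r.rating())))
    .join(predictions).values();

// mapToDouble replaces the old anonymous Function plus JavaDoubleRDD.fromRDD plumbing.
double MSE = ratesAndPreds.mapToDouble(pair -> {
  double err = pair._1() - pair._2();
  return err * err;
}).mean();
```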
[2/2] spark git commit: [SPARK-19533][EXAMPLES] Convert Java tests to use lambdas, Java 8 features
[SPARK-19533][EXAMPLES] Convert Java tests to use lambdas, Java 8 features ## What changes were proposed in this pull request? Convert Java tests to use lambdas, Java 8 features. ## How was this patch tested? Jenkins tests. Author: Sean Owen Closes #16961 from srowen/SPARK-19533. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/de14d35f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/de14d35f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/de14d35f Branch: refs/heads/master Commit: de14d35f77071932963a994fac5aec0e5df838a1 Parents: ba8912e Author: Sean Owen Authored: Sun Feb 19 09:37:56 2017 -0800 Committer: Sean Owen Committed: Sun Feb 19 09:37:56 2017 -0800 -- .../org/apache/spark/examples/JavaLogQuery.java | 21 +-- .../org/apache/spark/examples/JavaPageRank.java | 49 ++- .../org/apache/spark/examples/JavaSparkPi.java | 20 +-- .../spark/examples/JavaStatusTrackerDemo.java | 5 +- .../java/org/apache/spark/examples/JavaTC.java | 8 +- .../apache/spark/examples/JavaWordCount.java| 27 +--- .../spark/examples/ml/JavaALSExample.java | 7 +- ...SelectionViaTrainValidationSplitExample.java | 3 - .../spark/examples/ml/JavaTokenizerExample.java | 13 +- .../examples/ml/JavaVectorSlicerExample.java| 7 +- .../mllib/JavaAssociationRulesExample.java | 6 +- .../JavaBinaryClassificationMetricsExample.java | 33 ++--- .../mllib/JavaBisectingKMeansExample.java | 7 +- .../mllib/JavaChiSqSelectorExample.java | 38 ++ .../JavaDecisionTreeClassificationExample.java | 26 +--- .../JavaDecisionTreeRegressionExample.java | 33 ++--- .../mllib/JavaElementwiseProductExample.java| 27 +--- .../mllib/JavaGaussianMixtureExample.java | 19 +-- ...vaGradientBoostingClassificationExample.java | 21 +-- .../JavaGradientBoostingRegressionExample.java | 30 + .../mllib/JavaIsotonicRegressionExample.java| 39 ++ .../spark/examples/mllib/JavaKMeansExample.java | 19 +-- .../spark/examples/mllib/JavaLBFGSExample.java | 23 +--- .../JavaLatentDirichletAllocationExample.java | 28 ++-- .../JavaLinearRegressionWithSGDExample.java | 47 +++ .../JavaLogisticRegressionWithLBFGSExample.java | 14 +- ...aMulticlassClassificationMetricsExample.java | 13 +- .../examples/mllib/JavaNaiveBayesExample.java | 19 +-- .../JavaPowerIterationClusteringExample.java| 6 +- .../JavaRandomForestClassificationExample.java | 23 +--- .../JavaRandomForestRegressionExample.java | 37 ++--- .../mllib/JavaRankingMetricsExample.java| 135 ++- .../mllib/JavaRecommendationExample.java| 58 +++- .../mllib/JavaRegressionMetricsExample.java | 31 ++--- .../examples/mllib/JavaSVMWithSGDExample.java | 13 +- .../examples/mllib/JavaSimpleFPGrowth.java | 12 +- .../mllib/JavaStreamingTestExample.java | 40 ++ .../examples/sql/JavaSQLDataSourceExample.java | 8 +- .../spark/examples/sql/JavaSparkSQLExample.java | 60 +++-- .../examples/sql/hive/JavaSparkHiveExample.java | 9 +- .../streaming/JavaStructuredKafkaWordCount.java | 10 +- .../JavaStructuredNetworkWordCount.java | 11 +- .../JavaStructuredNetworkWordCountWindowed.java | 16 +-- .../examples/streaming/JavaCustomReceiver.java | 34 + .../streaming/JavaDirectKafkaWordCount.java | 31 + .../examples/streaming/JavaFlumeEventCount.java | 8 +- .../examples/streaming/JavaKafkaWordCount.java | 33 + .../streaming/JavaNetworkWordCount.java | 25 +--- .../examples/streaming/JavaQueueStream.java | 24 +--- .../JavaRecoverableNetworkWordCount.java| 91 + .../streaming/JavaSqlNetworkWordCount.java | 51 +++ .../streaming/JavaStatefulNetworkWordCount.java | 30 + 52 files 
changed, 380 insertions(+), 1018 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/de14d35f/examples/src/main/java/org/apache/spark/examples/JavaLogQuery.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/JavaLogQuery.java b/examples/src/main/java/org/apache/spark/examples/JavaLogQuery.java index 7775443..cf12de3 100644 --- a/examples/src/main/java/org/apache/spark/examples/JavaLogQuery.java +++ b/examples/src/main/java/org/apache/spark/examples/JavaLogQuery.java @@ -17,18 +17,16 @@ package org.apache.spark.examples; -import com.google.common.collect.Lists; import scala.Tuple2; import scala.Tuple3; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.Function2; -import
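Editor's note: the pattern this commit applies throughout the examples, shown as a small hedged before/after sketch (not lifted from the diff; `pairs` is assumed to be a `JavaPairRDD<String, Integer>`): every anonymous inner class implementing one of the `org.apache.spark.api.java.function` interfaces collapses to a lambda, and the corresponding imports can be dropped.

```java
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.Function2;

// Before (Java 7 style): an anonymous inner class per function argument.
JavaPairRDD<String, Integer> countsOld = pairs.reduceByKey(
    new Function2<Integer, Integer, Integer>() {
      @Override
      public Integer call(Integer a, Integer b) {
        return a + b;
      }
    });

// After (Java 8 style): the same logic as a lambda; the Function2 import goes away.
JavaPairRDD<String, Integer> counts = pairs.reduceByKey((a, b) -> a + b);
```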
[3/3] spark git commit: [SPARK-19534][TESTS] Convert Java tests to use lambdas, Java 8 features
[SPARK-19534][TESTS] Convert Java tests to use lambdas, Java 8 features ## What changes were proposed in this pull request? Convert tests to use Java 8 lambdas, and modest related fixes to surrounding code. ## How was this patch tested? Jenkins tests Author: Sean Owen Closes #16964 from srowen/SPARK-19534. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1487c9af Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1487c9af Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1487c9af Branch: refs/heads/master Commit: 1487c9af20a333ead55955acf4c0aa323bea0d07 Parents: de14d35 Author: Sean Owen Authored: Sun Feb 19 09:42:50 2017 -0800 Committer: Sean Owen Committed: Sun Feb 19 09:42:50 2017 -0800 -- .../apache/spark/network/TransportContext.java | 6 +- .../spark/network/util/MapConfigProvider.java | 8 +- .../network/ChunkFetchIntegrationSuite.java | 37 +- .../network/RequestTimeoutIntegrationSuite.java | 3 +- .../network/TransportClientFactorySuite.java| 51 +- .../network/TransportResponseHandlerSuite.java | 14 +- .../network/crypto/AuthIntegrationSuite.java| 19 +- .../spark/network/sasl/SparkSaslSuite.java | 65 +-- .../util/TransportFrameDecoderSuite.java| 44 +- .../network/sasl/SaslIntegrationSuite.java | 34 +- .../ExternalShuffleBlockHandlerSuite.java | 2 +- .../shuffle/ExternalShuffleCleanupSuite.java| 6 +- .../ExternalShuffleIntegrationSuite.java| 13 +- .../shuffle/OneForOneBlockFetcherSuite.java | 78 ++- .../shuffle/RetryingBlockFetcherSuite.java | 64 ++- .../unsafe/sort/UnsafeExternalSorter.java | 1 - .../java/org/apache/spark/JavaJdbcRDDSuite.java | 26 +- .../shuffle/sort/UnsafeShuffleWriterSuite.java | 65 +-- .../map/AbstractBytesToBytesMapSuite.java | 25 +- .../unsafe/sort/UnsafeExternalSorterSuite.java | 25 +- .../test/org/apache/spark/Java8RDDAPISuite.java | 7 +- .../test/org/apache/spark/JavaAPISuite.java | 492 - .../kafka010/JavaConsumerStrategySuite.java | 24 +- .../SparkSubmitCommandBuilderSuite.java | 2 +- .../launcher/SparkSubmitOptionParserSuite.java | 8 +- .../apache/spark/ml/feature/JavaPCASuite.java | 35 +- .../classification/JavaNaiveBayesSuite.java | 10 +- .../clustering/JavaBisectingKMeansSuite.java| 4 +- .../spark/mllib/clustering/JavaLDASuite.java| 40 +- .../mllib/fpm/JavaAssociationRulesSuite.java| 6 +- .../regression/JavaLinearRegressionSuite.java | 11 +- .../spark/mllib/tree/JavaDecisionTreeSuite.java | 15 +- .../SpecificParquetRecordReaderBase.java| 2 +- .../spark/sql/Java8DatasetAggregatorSuite.java | 16 +- .../apache/spark/sql/JavaApplySchemaSuite.java | 22 +- .../apache/spark/sql/JavaDataFrameSuite.java| 47 +- .../spark/sql/JavaDatasetAggregatorSuite.java | 49 +- .../sql/JavaDatasetAggregatorSuiteBase.java | 14 +- .../org/apache/spark/sql/JavaDatasetSuite.java | 147 ++ .../test/org/apache/spark/sql/JavaUDFSuite.java | 37 +- .../spark/streaming/JavaMapWithStateSuite.java | 81 +-- .../spark/streaming/JavaReceiverAPISuite.java | 24 +- .../spark/streaming/JavaWriteAheadLogSuite.java | 10 +- .../apache/spark/streaming/Java8APISuite.java | 21 +- .../apache/spark/streaming/JavaAPISuite.java| 526 +-- 45 files changed, 662 insertions(+), 1574 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1487c9af/common/network-common/src/main/java/org/apache/spark/network/TransportContext.java -- diff --git a/common/network-common/src/main/java/org/apache/spark/network/TransportContext.java b/common/network-common/src/main/java/org/apache/spark/network/TransportContext.java index 
37ba543..965c4ae 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/TransportContext.java +++ b/common/network-common/src/main/java/org/apache/spark/network/TransportContext.java @@ -17,9 +17,9 @@ package org.apache.spark.network; +import java.util.ArrayList; import java.util.List; -import com.google.common.collect.Lists; import io.netty.channel.Channel; import io.netty.channel.socket.SocketChannel; import io.netty.handler.timeout.IdleStateHandler; @@ -100,7 +100,7 @@ public class TransportContext { } public TransportClientFactory createClientFactory() { -return createClientFactory(Lists.newArrayList()); +return createClientFactory(new ArrayList<>()); } /** Create a server which will attempt to bind to a specific port. */ @@ -120,7 +120,7 @@ public class TransportContext { } public TransportServer createServer() { -return createServer(0, Lists.newArr
[1/3] spark git commit: [SPARK-19534][TESTS] Convert Java tests to use lambdas, Java 8 features
Repository: spark Updated Branches: refs/heads/master de14d35f7 -> 1487c9af2 http://git-wip-us.apache.org/repos/asf/spark/blob/1487c9af/sql/core/src/test/java/test/org/apache/spark/sql/JavaDatasetSuite.java -- diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDatasetSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDatasetSuite.java index a94a37c..577672c 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDatasetSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDatasetSuite.java @@ -96,12 +96,7 @@ public class JavaDatasetSuite implements Serializable { @Test public void testTypedFilterPreservingSchema() { Dataset ds = spark.range(10); -Dataset ds2 = ds.filter(new FilterFunction() { - @Override - public boolean call(Long value) throws Exception { -return value > 3; - } -}); +Dataset ds2 = ds.filter((FilterFunction) value -> value > 3); Assert.assertEquals(ds.schema(), ds2.schema()); } @@ -111,44 +106,28 @@ public class JavaDatasetSuite implements Serializable { Dataset ds = spark.createDataset(data, Encoders.STRING()); Assert.assertEquals("hello", ds.first()); -Dataset filtered = ds.filter(new FilterFunction() { - @Override - public boolean call(String v) throws Exception { -return v.startsWith("h"); - } -}); +Dataset filtered = ds.filter((FilterFunction) v -> v.startsWith("h")); Assert.assertEquals(Arrays.asList("hello"), filtered.collectAsList()); -Dataset mapped = ds.map(new MapFunction() { - @Override - public Integer call(String v) throws Exception { -return v.length(); - } -}, Encoders.INT()); +Dataset mapped = ds.map((MapFunction) v -> v.length(), Encoders.INT()); Assert.assertEquals(Arrays.asList(5, 5), mapped.collectAsList()); -Dataset parMapped = ds.mapPartitions(new MapPartitionsFunction() { - @Override - public Iterator call(Iterator it) { -List ls = new LinkedList<>(); -while (it.hasNext()) { - ls.add(it.next().toUpperCase(Locale.ENGLISH)); -} -return ls.iterator(); +Dataset parMapped = ds.mapPartitions((MapPartitionsFunction) it -> { + List ls = new LinkedList<>(); + while (it.hasNext()) { +ls.add(it.next().toUpperCase(Locale.ENGLISH)); } + return ls.iterator(); }, Encoders.STRING()); Assert.assertEquals(Arrays.asList("HELLO", "WORLD"), parMapped.collectAsList()); -Dataset flatMapped = ds.flatMap(new FlatMapFunction() { - @Override - public Iterator call(String s) { -List ls = new LinkedList<>(); -for (char c : s.toCharArray()) { - ls.add(String.valueOf(c)); -} -return ls.iterator(); +Dataset flatMapped = ds.flatMap((FlatMapFunction) s -> { + List ls = new LinkedList<>(); + for (char c : s.toCharArray()) { +ls.add(String.valueOf(c)); } + return ls.iterator(); }, Encoders.STRING()); Assert.assertEquals( Arrays.asList("h", "e", "l", "l", "o", "w", "o", "r", "l", "d"), @@ -157,16 +136,11 @@ public class JavaDatasetSuite implements Serializable { @Test public void testForeach() { -final LongAccumulator accum = jsc.sc().longAccumulator(); +LongAccumulator accum = jsc.sc().longAccumulator(); List data = Arrays.asList("a", "b", "c"); Dataset ds = spark.createDataset(data, Encoders.STRING()); -ds.foreach(new ForeachFunction() { - @Override - public void call(String s) throws Exception { -accum.add(1); - } -}); +ds.foreach((ForeachFunction) s -> accum.add(1)); Assert.assertEquals(3, accum.value().intValue()); } @@ -175,12 +149,7 @@ public class JavaDatasetSuite implements Serializable { List data = Arrays.asList(1, 2, 3); Dataset ds = spark.createDataset(data, Encoders.INT()); -int reduced = ds.reduce(new 
ReduceFunction() { - @Override - public Integer call(Integer v1, Integer v2) throws Exception { -return v1 + v2; - } -}); +int reduced = ds.reduce((ReduceFunction) (v1, v2) -> v1 + v2); Assert.assertEquals(6, reduced); } @@ -189,52 +158,38 @@ public class JavaDatasetSuite implements Serializable { List data = Arrays.asList("a", "foo", "bar"); Dataset ds = spark.createDataset(data, Encoders.STRING()); KeyValueGroupedDataset grouped = ds.groupByKey( - new MapFunction() { -@Override -public Integer call(String v) throws Exception { - return v.length(); -} - }, +(MapFunction) v -> v.length(), Encoders.INT()); -Dataset mapped = grouped.mapGroups(new MapGroupsFunction() { - @Override - public String call(Integer key, Iterator values) throws Exception { -StringBuilder sb = new StringBui
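Editor's note: one detail the flattened diff above obscures is that the archive strips generic parameters, and that the Dataset API overloads `filter`, `map`, `mapPartitions`, etc. for both Scala and Java function types, so each lambda is cast to the Java functional interface to pick the right overload. A hedged sketch with the generics restored (`spark` is assumed to be a `SparkSession`):

```java
import java.util.Arrays;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;

// The cast to FilterFunction<Long> disambiguates the Java overload from the Scala one.
Dataset<Long> ds = spark.range(10);
Dataset<Long> ds2 = ds.filter((FilterFunction<Long>) value -> value > 3);

// Same idea for map(): the cast selects the overload that also takes an Encoder.
Dataset<String> strings =
    spark.createDataset(Arrays.asList("hello", "world"), Encoders.STRING());
Dataset<Integer> lengths =
    strings.map((MapFunction<String, Integer>) String::length, Encoders.INT());
```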
[2/3] spark git commit: [SPARK-19534][TESTS] Convert Java tests to use lambdas, Java 8 features
http://git-wip-us.apache.org/repos/asf/spark/blob/1487c9af/core/src/test/java/test/org/apache/spark/JavaAPISuite.java -- diff --git a/core/src/test/java/test/org/apache/spark/JavaAPISuite.java b/core/src/test/java/test/org/apache/spark/JavaAPISuite.java index 80aab10..5121491 100644 --- a/core/src/test/java/test/org/apache/spark/JavaAPISuite.java +++ b/core/src/test/java/test/org/apache/spark/JavaAPISuite.java @@ -31,7 +31,6 @@ import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; -import java.util.Set; import java.util.concurrent.*; import org.apache.spark.Accumulator; @@ -208,7 +207,7 @@ public class JavaAPISuite implements Serializable { assertEquals(new Tuple2<>(3, 2), sortedPairs.get(2)); // Custom comparator -sortedRDD = rdd.sortByKey(Collections.reverseOrder(), false); +sortedRDD = rdd.sortByKey(Collections.reverseOrder(), false); assertEquals(new Tuple2<>(-1, 1), sortedRDD.first()); sortedPairs = sortedRDD.collect(); assertEquals(new Tuple2<>(0, 4), sortedPairs.get(1)); @@ -266,13 +265,7 @@ public class JavaAPISuite implements Serializable { JavaRDD> rdd = sc.parallelize(pairs); // compare on first value -JavaRDD> sortedRDD = -rdd.sortBy(new Function, Integer>() { - @Override - public Integer call(Tuple2 t) { -return t._1(); - } -}, true, 2); +JavaRDD> sortedRDD = rdd.sortBy(Tuple2::_1, true, 2); assertEquals(new Tuple2<>(-1, 1), sortedRDD.first()); List> sortedPairs = sortedRDD.collect(); @@ -280,12 +273,7 @@ public class JavaAPISuite implements Serializable { assertEquals(new Tuple2<>(3, 2), sortedPairs.get(2)); // compare on second value -sortedRDD = rdd.sortBy(new Function, Integer>() { - @Override - public Integer call(Tuple2 t) { -return t._2(); - } -}, true, 2); +sortedRDD = rdd.sortBy(Tuple2::_2, true, 2); assertEquals(new Tuple2<>(-1, 1), sortedRDD.first()); sortedPairs = sortedRDD.collect(); assertEquals(new Tuple2<>(3, 2), sortedPairs.get(1)); @@ -294,28 +282,20 @@ public class JavaAPISuite implements Serializable { @Test public void foreach() { -final LongAccumulator accum = sc.sc().longAccumulator(); +LongAccumulator accum = sc.sc().longAccumulator(); JavaRDD rdd = sc.parallelize(Arrays.asList("Hello", "World")); -rdd.foreach(new VoidFunction() { - @Override - public void call(String s) { -accum.add(1); - } -}); +rdd.foreach(s -> accum.add(1)); assertEquals(2, accum.value().intValue()); } @Test public void foreachPartition() { -final LongAccumulator accum = sc.sc().longAccumulator(); +LongAccumulator accum = sc.sc().longAccumulator(); JavaRDD rdd = sc.parallelize(Arrays.asList("Hello", "World")); -rdd.foreachPartition(new VoidFunction>() { - @Override - public void call(Iterator iter) { -while (iter.hasNext()) { - iter.next(); - accum.add(1); -} +rdd.foreachPartition(iter -> { + while (iter.hasNext()) { +iter.next(); +accum.add(1); } }); assertEquals(2, accum.value().intValue()); @@ -361,12 +341,7 @@ public class JavaAPISuite implements Serializable { @Test public void groupBy() { JavaRDD rdd = sc.parallelize(Arrays.asList(1, 1, 2, 3, 5, 8, 13)); -Function isOdd = new Function() { - @Override - public Boolean call(Integer x) { -return x % 2 == 0; - } -}; +Function isOdd = x -> x % 2 == 0; JavaPairRDD> oddsAndEvens = rdd.groupBy(isOdd); assertEquals(2, oddsAndEvens.count()); assertEquals(2, Iterables.size(oddsAndEvens.lookup(true).get(0))); // Evens @@ -383,12 +358,7 @@ public class JavaAPISuite implements Serializable { // Regression test for SPARK-4459 JavaRDD rdd = sc.parallelize(Arrays.asList(1, 1, 2, 3, 5, 8, 13)); 
Function, Boolean> areOdd = - new Function, Boolean>() { -@Override -public Boolean call(Tuple2 x) { - return (x._1() % 2 == 0) && (x._2() % 2 == 0); -} - }; +x -> (x._1() % 2 == 0) && (x._2() % 2 == 0); JavaPairRDD pairRDD = rdd.zip(rdd); JavaPairRDD>> oddsAndEvens = pairRDD.groupBy(areOdd); assertEquals(2, oddsAndEvens.count()); @@ -406,13 +376,7 @@ public class JavaAPISuite implements Serializable { public void keyByOnPairRDD() { // Regression test for SPARK-4459 JavaRDD rdd = sc.parallelize(Arrays.asList(1, 1, 2, 3, 5, 8, 13)); -Function, String> sumToString = - new Function, String>() { -@Override -public String call(Tuple2 x) { - return String.valueOf(x._1() + x._2()); -} - }; +Function, String> sumToString = x -> String.valueOf(x._1() + x._2()); JavaPairRDD pairRDD = rdd.zip(rdd); JavaPairRDD>
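Editor's note: as the diff above shows, some converted call sites become method references rather than lambdas. A short hedged sketch (`rdd` is assumed to be a `JavaRDD<Tuple2<Integer, Integer>>`, as in the suite):

```java
import scala.Tuple2;
import org.apache.spark.api.java.JavaRDD;

// sortBy takes a key-extraction function; Tuple2::_1 and Tuple2::_2 replace
// anonymous classes whose call() simply returned t._1() or t._2().
JavaRDD<Tuple2<Integer, Integer>> byFirst = rdd.sortBy(Tuple2::_1, true, 2);
JavaRDD<Tuple2<Integer, Integer>> bySecond = rdd.sortBy(Tuple2::_2, true, 2);
```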
spark-website git commit: Update Java example to use Java 8; make Scala/Python pi example consistent with better Java version; minor syntax fixes to these
Repository: spark-website Updated Branches: refs/heads/asf-site ae58782ba -> 879303593 Update Java example to use Java 8; make Scala/Python pi example consistent with better Java version; minor syntax fixes to these Project: http://git-wip-us.apache.org/repos/asf/spark-website/repo Commit: http://git-wip-us.apache.org/repos/asf/spark-website/commit/87930359 Tree: http://git-wip-us.apache.org/repos/asf/spark-website/tree/87930359 Diff: http://git-wip-us.apache.org/repos/asf/spark-website/diff/87930359 Branch: refs/heads/asf-site Commit: 879303593efa229d416eb4178913c1c1a6f7033c Parents: ae58782 Author: Sean Owen Authored: Sun Feb 19 08:28:48 2017 -0800 Committer: Sean Owen Committed: Sun Feb 19 08:28:48 2017 -0800 -- examples.md| 57 +++-- site/examples.html | 57 +++-- 2 files changed, 44 insertions(+), 70 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark-website/blob/87930359/examples.md -- diff --git a/examples.md b/examples.md index 7f13e41..4a87331 100644 --- a/examples.md +++ b/examples.md @@ -61,15 +61,9 @@ counts.saveAsTextFile("hdfs://...") {% highlight java %} JavaRDD textFile = sc.textFile("hdfs://..."); -JavaRDD words = textFile.flatMap(new FlatMapFunction() { - public Iterator call(String s) { return Arrays.asList(s.split(" ")).iterator(); } -}); -JavaPairRDD pairs = words.mapToPair(new PairFunction() { - public Tuple2 call(String s) { return new Tuple2(s, 1); } -}); -JavaPairRDD counts = pairs.reduceByKey(new Function2() { - public Integer call(Integer a, Integer b) { return a + b; } -}); +JavaRDD words = textFile.flatMap(s -> Arrays.asList(s.split(" ")).iterator()) +.mapToPair(word -> new Tuple2<>(word, 1)) +.reduceByKey((a, b) -> a + b); counts.saveAsTextFile("hdfs://..."); {% endhighlight %} @@ -89,12 +83,12 @@ counts.saveAsTextFile("hdfs://..."); {% highlight python %} -def sample(p): -x, y = random(), random() -return 1 if x*x + y*y < 1 else 0 +def inside(p): +x, y = random.random(), random.random() +return x*x + y*y < 1 -count = sc.parallelize(xrange(0, NUM_SAMPLES)).map(sample) \ - .reduce(lambda a, b: a + b) +count = sc.parallelize(xrange(0, NUM_SAMPLES)) \ + .filter(inside).count() print "Pi is roughly %f" % (4.0 * count / NUM_SAMPLES) {% endhighlight %} @@ -103,12 +97,12 @@ print "Pi is roughly %f" % (4.0 * count / NUM_SAMPLES) {% highlight scala %} -val count = sc.parallelize(1 to NUM_SAMPLES).map{i => - val x = Math.random() - val y = Math.random() - if (x*x + y*y < 1) 1 else 0 -}.reduce(_ + _) -println("Pi is roughly " + 4.0 * count / NUM_SAMPLES) +val count = sc.parallelize(1 to NUM_SAMPLES).filter { _ => + val x = math.random + val y = math.random + x*x + y*y < 1 +}.count() +println(s"Pi is roughly ${4.0 * count / NUM_SAMPLES}") {% endhighlight %} @@ -116,17 +110,15 @@ println("Pi is roughly " + 4.0 * count / NUM_SAMPLES) {% highlight java %} -List l = new ArrayList(NUM_SAMPLES); +List l = new ArrayList<>(NUM_SAMPLES); for (int i = 0; i < NUM_SAMPLES; i++) { l.add(i); } -long count = sc.parallelize(l).filter(new Function() { - public Boolean call(Integer i) { -double x = Math.random(); -double y = Math.random(); -return x*x + y*y < 1; - } +long count = sc.parallelize(l).filter(i -> { + double x = Math.random(); + double y = Math.random(); + return x*x + y*y < 1; }).count(); System.out.println("Pi is roughly " + 4.0 * count / NUM_SAMPLES); {% endhighlight %} @@ -194,14 +186,9 @@ errors.filter(col("line").like("%MySQL%")).collect() {% highlight java %} // Creates a DataFrame having a single column named "line" JavaRDD textFile = sc.textFile("hdfs://..."); 
-JavaRDD rowRDD = textFile.map( - new Function() { -public Row call(String line) throws Exception { - return RowFactory.create(line); -} - }); -List fields = new ArrayList(); -fields.add(DataTypes.createStructField("line", DataTypes.StringType, true)); +JavaRDD rowRDD = textFile.map(RowFactory::create); +List fields = Arrays.asList( + DataTypes.createStructField("line", DataTypes.StringType, true)); StructType schema = DataTypes.createStructType(fields); DataFrame df = sqlContext.createDataFrame(rowRDD, schema); http://git-wip-us.apache.org/repos/asf/spark-website/blob/87930359/site/examples.html -- diff --git a/site/examples.html b/site/examples.html index bfff52d..05ec479 100644 --- a/site/examples.html +++ b/site/examples.html @@ -247,15 +247,9 @@ In this page, we will show examples using RDD API as well as examples using high JavaRDDtex
spark git commit: [SPARK-19646][CORE][STREAMING] binaryRecords replicates records in scala API
Repository: spark Updated Branches: refs/heads/master 776b8f17c -> d0ecca607 [SPARK-19646][CORE][STREAMING] binaryRecords replicates records in scala API ## What changes were proposed in this pull request? Use `BytesWritable.copyBytes`, not `getBytes`, because `getBytes` returns the underlying array, which may be reused when repeated reads don't need a different size, as is the case with binaryRecords APIs ## How was this patch tested? Existing tests Author: Sean Owen Closes #16974 from srowen/SPARK-19646. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d0ecca60 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d0ecca60 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d0ecca60 Branch: refs/heads/master Commit: d0ecca6075d86bedebf8bc2278085a2cd6cb0a43 Parents: 776b8f1 Author: Sean Owen Authored: Mon Feb 20 09:02:09 2017 -0800 Committer: Sean Owen Committed: Mon Feb 20 09:02:09 2017 -0800 -- .../scala/org/apache/spark/SparkContext.scala | 5 +- .../test/scala/org/apache/spark/FileSuite.scala | 178 --- .../spark/streaming/StreamingContext.scala | 5 +- .../spark/streaming/InputStreamsSuite.scala | 21 +-- 4 files changed, 53 insertions(+), 156 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d0ecca60/core/src/main/scala/org/apache/spark/SparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index e4d8389..17194b9 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -961,12 +961,11 @@ class SparkContext(config: SparkConf) extends Logging { classOf[LongWritable], classOf[BytesWritable], conf = conf) -val data = br.map { case (k, v) => - val bytes = v.getBytes +br.map { case (k, v) => + val bytes = v.copyBytes() assert(bytes.length == recordLength, "Byte array does not have correct length") bytes } -data } /** http://git-wip-us.apache.org/repos/asf/spark/blob/d0ecca60/core/src/test/scala/org/apache/spark/FileSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/FileSuite.scala b/core/src/test/scala/org/apache/spark/FileSuite.scala index 6538507..a2d3177 100644 --- a/core/src/test/scala/org/apache/spark/FileSuite.scala +++ b/core/src/test/scala/org/apache/spark/FileSuite.scala @@ -18,6 +18,7 @@ package org.apache.spark import java.io._ +import java.nio.ByteBuffer import java.util.zip.GZIPOutputStream import scala.io.Source @@ -30,7 +31,6 @@ import org.apache.hadoop.mapreduce.Job import org.apache.hadoop.mapreduce.lib.input.{FileSplit => NewFileSplit, TextInputFormat => NewTextInputFormat} import org.apache.hadoop.mapreduce.lib.output.{TextOutputFormat => NewTextOutputFormat} -import org.apache.spark.input.PortableDataStream import org.apache.spark.internal.config.IGNORE_CORRUPT_FILES import org.apache.spark.rdd.{HadoopRDD, NewHadoopRDD} import org.apache.spark.storage.StorageLevel @@ -237,24 +237,26 @@ class FileSuite extends SparkFunSuite with LocalSparkContext { assert(output.map(_.toString).collect().toList === List("(1,a)", "(2,aa)", "(3,aaa)")) } - test("binary file input as byte array") { -sc = new SparkContext("local", "test") + private def writeBinaryData(testOutput: Array[Byte], testOutputCopies: Int): File = { val outFile = new File(tempDir, "record-bytestream-0.bin") -val outFileName = outFile.getAbsolutePath() - -// create file -val testOutput = Array[Byte](1, 2, 3, 4, 5, 6) -val bbuf = 
java.nio.ByteBuffer.wrap(testOutput) -// write data to file -val file = new java.io.FileOutputStream(outFile) +val file = new FileOutputStream(outFile) val channel = file.getChannel -channel.write(bbuf) +for (i <- 0 until testOutputCopies) { + // Shift values by i so that they're different in the output + val alteredOutput = testOutput.map(b => (b + i).toByte) + channel.write(ByteBuffer.wrap(alteredOutput)) +} channel.close() file.close() +outFile + } -val inRdd = sc.binaryFiles(outFileName) -val (infile: String, indata: PortableDataStream) = inRdd.collect.head - + test("binary file input as byte array") { +sc = new SparkContext("local", "test") +val testOutput = Array[Byte](1, 2, 3, 4, 5, 6) +val outFile = writeBinaryData(testOutput, 1) +val inRdd =
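Editor's note: the one-line fix above (`copyBytes` instead of `getBytes`) matters because Hadoop reuses and pads the `BytesWritable` buffer between records, so records materialized from `getBytes` can all alias the same array. A hedged, self-contained illustration of the difference (not taken from the patch):

```java
import org.apache.hadoop.io.BytesWritable;

BytesWritable w = new BytesWritable();
w.set(new byte[]{1, 2, 3, 4, 5, 6}, 0, 6);

byte[] aliased = w.getBytes();   // the internal buffer: possibly padded, reused on the next read
byte[] copied  = w.copyBytes();  // a fresh array of exactly getLength() bytes

// Simulate the record reader reusing the same writable for the next record.
w.set(new byte[]{9, 9, 9, 9, 9, 9}, 0, 6);

// 'aliased' now reflects the second record as well; 'copied' still holds 1..6.
// This aliasing is why binaryRecords appeared to replicate records.
```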
spark git commit: [SPARK-19646][CORE][STREAMING] binaryRecords replicates records in scala API
Repository: spark Updated Branches: refs/heads/branch-2.1 b083ec511 -> 7c371dec1 [SPARK-19646][CORE][STREAMING] binaryRecords replicates records in scala API ## What changes were proposed in this pull request? Use `BytesWritable.copyBytes`, not `getBytes`, because `getBytes` returns the underlying array, which may be reused when repeated reads don't need a different size, as is the case with binaryRecords APIs ## How was this patch tested? Existing tests Author: Sean Owen Closes #16974 from srowen/SPARK-19646. (cherry picked from commit d0ecca6075d86bedebf8bc2278085a2cd6cb0a43) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7c371dec Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7c371dec Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7c371dec Branch: refs/heads/branch-2.1 Commit: 7c371dec1c406831cdea86c7309960e08ddf2c36 Parents: b083ec5 Author: Sean Owen Authored: Mon Feb 20 09:02:09 2017 -0800 Committer: Sean Owen Committed: Mon Feb 20 09:02:18 2017 -0800 -- .../scala/org/apache/spark/SparkContext.scala | 5 +- .../test/scala/org/apache/spark/FileSuite.scala | 178 --- .../spark/streaming/StreamingContext.scala | 5 +- .../spark/streaming/InputStreamsSuite.scala | 21 +-- 4 files changed, 53 insertions(+), 156 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7c371dec/core/src/main/scala/org/apache/spark/SparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 11ad442..2db48f6 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -944,12 +944,11 @@ class SparkContext(config: SparkConf) extends Logging { classOf[LongWritable], classOf[BytesWritable], conf = conf) -val data = br.map { case (k, v) => - val bytes = v.getBytes +br.map { case (k, v) => + val bytes = v.copyBytes() assert(bytes.length == recordLength, "Byte array does not have correct length") bytes } -data } /** http://git-wip-us.apache.org/repos/asf/spark/blob/7c371dec/core/src/test/scala/org/apache/spark/FileSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/FileSuite.scala b/core/src/test/scala/org/apache/spark/FileSuite.scala index cc52bb1..0276575 100644 --- a/core/src/test/scala/org/apache/spark/FileSuite.scala +++ b/core/src/test/scala/org/apache/spark/FileSuite.scala @@ -18,6 +18,7 @@ package org.apache.spark import java.io._ +import java.nio.ByteBuffer import java.util.zip.GZIPOutputStream import scala.io.Source @@ -29,7 +30,6 @@ import org.apache.hadoop.mapreduce.Job import org.apache.hadoop.mapreduce.lib.input.{FileSplit => NewFileSplit, TextInputFormat => NewTextInputFormat} import org.apache.hadoop.mapreduce.lib.output.{TextOutputFormat => NewTextOutputFormat} -import org.apache.spark.input.PortableDataStream import org.apache.spark.internal.config.IGNORE_CORRUPT_FILES import org.apache.spark.rdd.{HadoopRDD, NewHadoopRDD} import org.apache.spark.storage.StorageLevel @@ -231,24 +231,26 @@ class FileSuite extends SparkFunSuite with LocalSparkContext { assert(output.map(_.toString).collect().toList === List("(1,a)", "(2,aa)", "(3,aaa)")) } - test("binary file input as byte array") { -sc = new SparkContext("local", "test") + private def writeBinaryData(testOutput: Array[Byte], testOutputCopies: Int): File = { val outFile = new File(tempDir, "record-bytestream-0.bin") -val outFileName = 
outFile.getAbsolutePath() - -// create file -val testOutput = Array[Byte](1, 2, 3, 4, 5, 6) -val bbuf = java.nio.ByteBuffer.wrap(testOutput) -// write data to file -val file = new java.io.FileOutputStream(outFile) +val file = new FileOutputStream(outFile) val channel = file.getChannel -channel.write(bbuf) +for (i <- 0 until testOutputCopies) { + // Shift values by i so that they're different in the output + val alteredOutput = testOutput.map(b => (b + i).toByte) + channel.write(ByteBuffer.wrap(alteredOutput)) +} channel.close() file.close() +outFile + } -val inRdd = sc.binaryFiles(outFileName) -val (infile: String, indata: PortableDataStream) = inRdd.collect.head - + test("binary file input as byte array") { +sc = new SparkContext("local", "test") +val test
spark git commit: [SPARK-19646][CORE][STREAMING] binaryRecords replicates records in scala API
Repository: spark Updated Branches: refs/heads/branch-2.0 5c3e56fd2 -> ddd432de2 [SPARK-19646][CORE][STREAMING] binaryRecords replicates records in scala API Use `BytesWritable.copyBytes`, not `getBytes`, because `getBytes` returns the underlying array, which may be reused when repeated reads don't need a different size, as is the case with binaryRecords APIs Existing tests Author: Sean Owen Closes #16974 from srowen/SPARK-19646. (cherry picked from commit d0ecca6075d86bedebf8bc2278085a2cd6cb0a43) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ddd432de Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ddd432de Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ddd432de Branch: refs/heads/branch-2.0 Commit: ddd432de2b5041138a458aae9f5117a3f41d835e Parents: 5c3e56f Author: Sean Owen Authored: Mon Feb 20 09:02:09 2017 -0800 Committer: Sean Owen Committed: Mon Feb 20 09:19:14 2017 -0800 -- .../scala/org/apache/spark/SparkContext.scala | 5 +- .../test/scala/org/apache/spark/FileSuite.scala | 182 --- .../spark/streaming/StreamingContext.scala | 5 +- .../spark/streaming/InputStreamsSuite.scala | 21 ++- 4 files changed, 55 insertions(+), 158 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ddd432de/core/src/main/scala/org/apache/spark/SparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 82e754b..2abe444 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -922,12 +922,11 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli classOf[LongWritable], classOf[BytesWritable], conf = conf) -val data = br.map { case (k, v) => - val bytes = v.getBytes +br.map { case (k, v) => + val bytes = v.copyBytes() assert(bytes.length == recordLength, "Byte array does not have correct length") bytes } -data } /** http://git-wip-us.apache.org/repos/asf/spark/blob/ddd432de/core/src/test/scala/org/apache/spark/FileSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/FileSuite.scala b/core/src/test/scala/org/apache/spark/FileSuite.scala index 993834f..98e9f8c 100644 --- a/core/src/test/scala/org/apache/spark/FileSuite.scala +++ b/core/src/test/scala/org/apache/spark/FileSuite.scala @@ -17,7 +17,8 @@ package org.apache.spark -import java.io.{File, FileWriter} +import java.io._ +import java.nio.ByteBuffer import scala.io.Source @@ -28,7 +29,6 @@ import org.apache.hadoop.mapreduce.Job import org.apache.hadoop.mapreduce.lib.input.{FileSplit => NewFileSplit, TextInputFormat => NewTextInputFormat} import org.apache.hadoop.mapreduce.lib.output.{TextOutputFormat => NewTextOutputFormat} -import org.apache.spark.input.PortableDataStream import org.apache.spark.rdd.{HadoopRDD, NewHadoopRDD} import org.apache.spark.storage.StorageLevel import org.apache.spark.util.Utils @@ -229,184 +229,82 @@ class FileSuite extends SparkFunSuite with LocalSparkContext { assert(output.map(_.toString).collect().toList === List("(1,a)", "(2,aa)", "(3,aaa)")) } - test("binary file input as byte array") { -sc = new SparkContext("local", "test") + private def writeBinaryData(testOutput: Array[Byte], testOutputCopies: Int): File = { val outFile = new File(tempDir, "record-bytestream-0.bin") -val outFileName = outFile.getAbsolutePath() - -// create file -val testOutput = Array[Byte](1, 2, 3, 4, 5, 6) -val bbuf = 
java.nio.ByteBuffer.wrap(testOutput) -// write data to file -val file = new java.io.FileOutputStream(outFile) +val file = new FileOutputStream(outFile) val channel = file.getChannel -channel.write(bbuf) +for (i <- 0 until testOutputCopies) { + // Shift values by i so that they're different in the output + val alteredOutput = testOutput.map(b => (b + i).toByte) + channel.write(ByteBuffer.wrap(alteredOutput)) +} channel.close() file.close() +outFile + } -val inRdd = sc.binaryFiles(outFileName) -val (infile: String, indata: PortableDataStream) = inRdd.collect.head - + test("binary file input as byte array") { +sc = new SparkContext("local", "test") +val testOutput = Array[Byte](1, 2, 3, 4, 5, 6) +val outFile = writeBinaryData(testOutput, 1
spark git commit: [SPARK-19646][BUILD][HOTFIX] Fix compile error from cherry-pick of SPARK-19646 into branch 2.1
Repository: spark Updated Branches: refs/heads/branch-2.1 7c371dec1 -> c3316743e [SPARK-19646][BUILD][HOTFIX] Fix compile error from cherry-pick of SPARK-19646 into branch 2.1 ## What changes were proposed in this pull request? Fix compile error from cherry-pick of SPARK-19646 into branch 2.1 ## How was this patch tested? Jenkins tests Author: Sean Owen Closes #17003 from srowen/SPARK-19646.2. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c3316743 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c3316743 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c3316743 Branch: refs/heads/branch-2.1 Commit: c3316743e676369ed8ce68fec5b28050a5a28d15 Parents: 7c371de Author: Sean Owen Authored: Mon Feb 20 12:19:54 2017 -0800 Committer: Sean Owen Committed: Mon Feb 20 12:19:54 2017 -0800 -- core/src/test/scala/org/apache/spark/FileSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c3316743/core/src/test/scala/org/apache/spark/FileSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/FileSuite.scala b/core/src/test/scala/org/apache/spark/FileSuite.scala index 0276575..467a16d 100644 --- a/core/src/test/scala/org/apache/spark/FileSuite.scala +++ b/core/src/test/scala/org/apache/spark/FileSuite.scala @@ -252,7 +252,7 @@ class FileSuite extends SparkFunSuite with LocalSparkContext { val inRdd = sc.binaryFiles(outFile.getAbsolutePath) val (infile, indata) = inRdd.collect().head // Make sure the name and array match -assert(infile.contains(outFileName)) // a prefix may get added +assert(infile.contains(outFile.getAbsolutePath)) // a prefix may get added assert(indata.toArray === testOutput) } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-19508][CORE] Improve error message when binding service fails
Repository: spark Updated Branches: refs/heads/master 73f065569 -> 339419145 [SPARK-19508][CORE] Improve error message when binding service fails ## What changes were proposed in this pull request? Utils provides a helper function to bind service on port. This function can bind the service to a random free port. However, if the binding fails on a random free port, the retrying and final exception messages look confusing. 17/02/06 16:25:43 WARN Utils: Service 'sparkDriver' could not bind on port 0. Attempting port 1. 17/02/06 16:25:43 WARN Utils: Service 'sparkDriver' could not bind on port 0. Attempting port 1. 17/02/06 16:25:43 WARN Utils: Service 'sparkDriver' could not bind on port 0. Attempting port 1. 17/02/06 16:25:43 WARN Utils: Service 'sparkDriver' could not bind on port 0. Attempting port 1. 17/02/06 16:25:43 WARN Utils: Service 'sparkDriver' could not bind on port 0. Attempting port 1. ... 17/02/06 16:25:43 ERROR SparkContext: Error initializing SparkContext. java.net.BindException: Can't assign requested address: Service 'sparkDriver' failed after 16 retries (starting from 0)! Consider explicitly setting the appropriate port for the service 'sparkDriver' (for example spark.ui.port for SparkUI) to an available port or increasing spark.port.maxRetries. ## How was this patch tested? Jenkins tests. Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Liang-Chi Hsieh Closes #16851 from viirya/better-log-message. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/33941914 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/33941914 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/33941914 Branch: refs/heads/master Commit: 33941914548cc5a65e8467821745d65728176368 Parents: 73f0655 Author: Liang-Chi Hsieh Authored: Mon Feb 20 21:25:21 2017 -0800 Committer: Sean Owen Committed: Mon Feb 20 21:25:21 2017 -0800 -- .../scala/org/apache/spark/util/Utils.scala | 27 +++- 1 file changed, 21 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/33941914/core/src/main/scala/org/apache/spark/util/Utils.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 1e6e9a2..5538289 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -2210,17 +2210,32 @@ private[spark] object Utils extends Logging { } catch { case e: Exception if isBindCollision(e) => if (offset >= maxRetries) { -val exceptionMessage = s"${e.getMessage}: Service$serviceString failed after " + - s"$maxRetries retries (starting from $startPort)! Consider explicitly setting " + - s"the appropriate port for the service$serviceString (for example spark.ui.port " + - s"for SparkUI) to an available port or increasing spark.port.maxRetries." +val exceptionMessage = if (startPort == 0) { + s"${e.getMessage}: Service$serviceString failed after " + +s"$maxRetries retries (on a random free port)! " + +s"Consider explicitly setting the appropriate binding address for " + +s"the service$serviceString (for example spark.driver.bindAddress " + +s"for SparkDriver) to the correct binding address." +} else { + s"${e.getMessage}: Service$serviceString failed after " + +s"$maxRetries retries (starting from $startPort)! 
Consider explicitly setting " + +s"the appropriate port for the service$serviceString (for example spark.ui.port " + +s"for SparkUI) to an available port or increasing spark.port.maxRetries." +} val exception = new BindException(exceptionMessage) // restore original stack trace exception.setStackTrace(e.getStackTrace) throw exception } - logWarning(s"Service$serviceString could not bind on port $tryPort. " + -s"Attempting port ${tryPort + 1}.") + if (startPort == 0) { +// As startPort 0 is for a random free port, it is most possibly binding address is +// not correct. +logWarning(s"Service$serviceString could not bind on a random free port. " + + "You may check whether configuring an appropriate binding address.") + } else { +logWarning(s"Service$serviceString could not bind on port $tryPort. " + + s"Attempting port ${tryPort + 1}.") +
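Editor's note: for context, the helper this patch touches tries a sequence of ports and only gives up after `spark.port.maxRetries` bind collisions; when the start port is 0 every attempt asks the OS for a random free port, which is why the old "Attempting port 1" message was confusing. A simplified, hypothetical Java sketch of that retry pattern (the real code is the Scala `Utils.startServiceOnPort`, which also wraps port numbers and logs the improved messages shown above; `bindService` here is an assumed placeholder):

```java
import java.net.BindException;

static void startWithRetries(int startPort, int maxRetries) throws BindException {
  for (int offset = 0; offset <= maxRetries; offset++) {
    // Port 0 means "pick a random free port", so each retry stays at 0
    // rather than walking to 1, 2, 3, ...
    int tryPort = (startPort == 0) ? 0 : startPort + offset;
    try {
      bindService(tryPort);  // assumed placeholder: binds or throws BindException
      return;
    } catch (BindException e) {
      if (offset >= maxRetries) {
        // This is the path whose final error message the patch improves.
        throw e;
      }
      // otherwise: log a warning (now special-cased for startPort == 0) and retry
    }
  }
}
```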
spark git commit: [SPARK-18922][TESTS] Fix new test failures on Windows due to path and resource not closed
Repository: spark Updated Branches: refs/heads/master 339419145 -> 17b93b5fe [SPARK-18922][TESTS] Fix new test failures on Windows due to path and resource not closed ## What changes were proposed in this pull request? This PR proposes to fix new test failures on WIndows as below: **Before** ``` KafkaRelationSuite: - test late binding start offsets *** FAILED *** (7 seconds, 679 milliseconds) Cause: java.nio.file.FileSystemException: C:\projects\spark\target\tmp\spark-4c4b0cd1-4cb7-4908-949d-1b0cc8addb50\topic-4-0\.log -> C:\projects\spark\target\tmp\spark-4c4b0cd1-4cb7-4908-949d-1b0cc8addb50\topic-4-0\.log.deleted: The process cannot access the file because it is being used by another process. KafkaSourceSuite: - deserialization of initial offset with Spark 2.1.0 *** FAILED *** (3 seconds, 542 milliseconds) java.io.IOException: Failed to delete: C:\projects\spark\target\tmp\spark-97ef64fc-ae61-4ce3-ac59-287fd38bd824 - deserialization of initial offset written by Spark 2.1.0 *** FAILED *** (60 milliseconds) java.nio.file.InvalidPathException: Illegal char <:> at index 2: /C:/projects/spark/external/kafka-0-10-sql/target/scala-2.11/test-classes/kafka-source-initial-offset-version-2.1.0.b HiveDDLSuite: - partitioned table should always put partition columns at the end of table schema *** FAILED *** (657 milliseconds) org.apache.spark.sql.AnalysisException: Path does not exist: file:/C:projectsspark arget mpspark-f1b83d09-850a-4bba-8e43-a2a28dfaa757; DDLSuite: - create a data source table without schema *** FAILED *** (94 milliseconds) org.apache.spark.sql.AnalysisException: Path does not exist: file:/C:projectsspark arget mpspark-a3f3c161-afae-4d6f-9182-e8642f77062b; - SET LOCATION for managed table *** FAILED *** (219 milliseconds) org.apache.spark.sql.catalyst.errors.package$TreeNodeException: execute, tree: Exchange SinglePartit +- *HashAggregate(keys=[], functions=[partial_count(1)], output=[count#99367L]) +- *FileScan parquet default.tbl[] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/C:projectssparkarget mpspark-15be2f2f-4ea9-4c47-bfee-1b7b49363033], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<> - insert data to a data source table which has a not existed location should succeed *** FAILED *** (16 milliseconds) org.apache.spark.sql.AnalysisException: Path does not exist: file:/C:projectsspark arget mpspark-34987671-e8d1-4624-ba5b-db1012e1246b; - insert into a data source table with no existed partition location should succeed *** FAILED *** (16 milliseconds) org.apache.spark.sql.AnalysisException: Path does not exist: file:/C:projectsspark arget mpspark-4c6ccfbf-4091-4032-9fbc-3d40c58267d5; - read data from a data source table which has a not existed location should succeed *** FAILED *** (0 milliseconds) - read data from a data source table with no existed partition location should succeed *** FAILED *** (0 milliseconds) org.apache.spark.sql.AnalysisException: Path does not exist: file:/C:projectsspark arget mpspark-6af39e37-abd1-44e8-ac68-e2dfcf67a2f3; InputOutputMetricsSuite: - output metrics on records written *** FAILED *** (0 milliseconds) java.lang.IllegalArgumentException: Wrong FS: file://C:\projects\spark\target\tmp\spark-cd69ee77-88f2-4202-bed6-19c0ee05ef55\InputOutputMetricsSuite, expected: file:/// - output metrics on records written - new Hadoop API *** FAILED *** (16 milliseconds) java.lang.IllegalArgumentException: Wrong FS: file://C:\projects\spark\target\tmp\spark-b69e8fcb-047b-4de8-9cdf-5f026efb6762\InputOutputMetricsSuite, 
expected: file:/// ``` **After** ``` KafkaRelationSuite: - test late binding start offsets !!! CANCELED !!! (62 milliseconds) KafkaSourceSuite: - deserialization of initial offset with Spark 2.1.0 (5 seconds, 341 milliseconds) - deserialization of initial offset written by Spark 2.1.0 (910 milliseconds) HiveDDLSuite: - partitioned table should always put partition columns at the end of table schema (2 seconds) DDLSuite: - create a data source table without schema (828 milliseconds) - SET LOCATION for managed table (406 milliseconds) - insert data to a data source table which has a not existed location should succeed (406 milliseconds) - insert into a data source table with no existed partition location should succeed (453 milliseconds) - read data from a data source table which has a not existed location should succeed (94 milliseconds) - read data from a data source table with no existed partition location should succeed (265 milliseconds) InputOutputMetricsSuite: - output metrics on records written (172 milliseconds) - output metrics on records written - new Hadoop API (297 milliseconds) ``` ## How was this patch tested? Fixed tests in `InputOutputMetricsSuite`, `KafkaRelationSuite`, `KafkaSourceSuite`, `DDLSuite.scala` and `HiveDDLSu
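Editor's note: the failures quoted above mostly come from raw Windows paths (backslashes and drive letters) being embedded directly into table locations and Hadoop URIs, which yields strings like `file:/C:projectsspark arget mpspark-...`, plus temp files left open so they cannot be deleted. A hedged illustration of the path half of the problem (not necessarily the exact change made by this patch):

```java
import java.io.File;

// Embedding a raw Windows path in a location string loses the backslashes
// (and "\t" becomes a tab), producing "file:/C:projectsspark arget mp...".
String raw = "C:\\projects\\spark\\target\\tmp\\spark-test";

// Going through File/URI yields a well-formed, forward-slash file: URI instead.
String location = new File(raw).toURI().toString();
// e.g. "file:/C:/projects/spark/target/tmp/spark-test"
```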
spark-website git commit: Add instructions for running individual tests.
Repository: spark-website Updated Branches: refs/heads/asf-site 879303593 -> ca64fac2e Add instructions for running individual tests. This is useful and I often forget how to do it. I learned some new tricks when @squito gave @jinxing64 some tips on how to do this, so I thought it was worth adding this to the website. Project: http://git-wip-us.apache.org/repos/asf/spark-website/repo Commit: http://git-wip-us.apache.org/repos/asf/spark-website/commit/ca64fac2 Tree: http://git-wip-us.apache.org/repos/asf/spark-website/tree/ca64fac2 Diff: http://git-wip-us.apache.org/repos/asf/spark-website/diff/ca64fac2 Branch: refs/heads/asf-site Commit: ca64fac2e24256dc3a07711e004c540b892965fe Parents: 8793035 Author: Kay Ousterhout Authored: Sat Feb 11 18:38:46 2017 -0800 Committer: Sean Owen Committed: Wed Feb 22 05:48:18 2017 -0800 -- developer-tools.md| 81 +- site/developer-tools.html | 72 - 2 files changed, 151 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark-website/blob/ca64fac2/developer-tools.md -- diff --git a/developer-tools.md b/developer-tools.md index e8853b8..88f3f36 100644 --- a/developer-tools.md +++ b/developer-tools.md @@ -9,7 +9,9 @@ navigation: Useful Developer Tools -Reducing Build Times +Reducing Build Times + +SBT: Avoiding Re-Creating the Assembly JAR Spark's default build strategy is to assemble a jar including all of its dependencies. This can be cumbersome when doing iterative development. When developing locally, it is possible to create @@ -32,6 +34,83 @@ $ ./bin/spark-shell $ build/sbt ~compile ``` +Maven: Speeding up Compilation with Zinc + +[Zinc](https://github.com/typesafehub/zinc) is a long-running server version of SBT's incremental +compiler. When run locally as a background process, it speeds up builds of Scala-based projects +like Spark. Developers who regularly recompile Spark with Maven will be the most interested in +Zinc. The project site gives instructions for building and running `zinc`; OS X users can +install it using `brew install zinc`. + +If using the `build/mvn` package `zinc` will automatically be downloaded and leveraged for all +builds. This process will auto-start after the first time `build/mvn` is called and bind to port +3030 unless the `ZINC_PORT` environment variable is set. The `zinc` process can subsequently be +shut down at any time by running `build/zinc-/bin/zinc -shutdown` and will automatically +restart whenever `build/mvn` is called. + +Running Individual Tests + +When developing locally, it's often convenient to run a single test or a few tests, rather than running the entire test suite. + +Testing with SBT + +The fastest way to run individual tests is to use the `sbt` console. It's fastest to keep a `sbt` console open, and use it to re-run tests as necessary. For example, to run all of the tests in a particular project, e.g., `core`: + +``` +$ build/sbt +> project core +> test +``` + +You can run a single test suite using the `testOnly` command. 
For example, to run the DAGSchedulerSuite: + +``` +> testOnly org.apache.spark.scheduler.DAGSchedulerSuite +``` + +The `testOnly` command accepts wildcards; e.g., you can also run the `DAGSchedulerSuite` with: + +``` +> testOnly *DAGSchedulerSuite +``` + +Or you could run all of the tests in the scheduler package: + +``` +> testOnly org.apache.spark.scheduler.* +``` + +If you'd like to run just a single test in the `DAGSchedulerSuite`, e.g., a test that includes "SPARK-12345" in the name, you run the following command in the sbt console: + +``` +> testOnly *DAGSchedulerSuite -- -z "SPARK-12345" +``` + +If you'd prefer, you can run all of these commands on the command line (but this will be slower than running tests using an open cosole). To do this, you need to surround `testOnly` and the following arguments in quotes: + +``` +$ build/sbt "core/testOnly *DAGSchedulerSuite -- -z SPARK-12345" +``` + +For more about how to run individual tests with sbt, see the [sbt documentation](http://www.scala-sbt.org/0.13/docs/Testing.html). + + +Testing with Maven + +With Maven, you can use the `-DwildcardSuites` flag to run individual Scala tests: + +``` +build/mvn -Dtest=none -DwildcardSuites=org.apache.spark.scheduler.DAGSchedulerSuite test +``` + +You need `-Dtest=none` to avoid running the Java tests. For more information about the ScalaTest Maven Plugin, refer to the [ScalaTest documentation](http://www.scalatest.org/user_guide/using_the_scalatest_maven_plugin). + +To run individual Java tests, you can use the `-Dtest` flag: + +``` +build/mvn test -DwildcardSuites=none -Dtest=org.apache.spark.streaming.JavaAPISuite test +``` + Checking Out Pull Re
spark-website git commit: Fix last update to Java pi example
Repository: spark-website Updated Branches: refs/heads/asf-site ca64fac2e -> 470b7ed51 Fix last update to Java pi example Project: http://git-wip-us.apache.org/repos/asf/spark-website/repo Commit: http://git-wip-us.apache.org/repos/asf/spark-website/commit/470b7ed5 Tree: http://git-wip-us.apache.org/repos/asf/spark-website/tree/470b7ed5 Diff: http://git-wip-us.apache.org/repos/asf/spark-website/diff/470b7ed5 Branch: refs/heads/asf-site Commit: 470b7ed51a112cdefd2ec6ee3a342b3956e05189 Parents: ca64fac Author: Sean Owen Authored: Wed Feb 22 05:10:01 2017 -0800 Committer: Sean Owen Committed: Wed Feb 22 05:52:43 2017 -0800 -- examples.md| 7 --- site/examples.html | 7 --- 2 files changed, 8 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark-website/blob/470b7ed5/examples.md -- diff --git a/examples.md b/examples.md index 4a87331..2d1dbaa 100644 --- a/examples.md +++ b/examples.md @@ -61,9 +61,10 @@ counts.saveAsTextFile("hdfs://...") {% highlight java %} JavaRDD textFile = sc.textFile("hdfs://..."); -JavaRDD words = textFile.flatMap(s -> Arrays.asList(s.split(" ")).iterator()) -.mapToPair(word -> new Tuple2<>(word, 1)) -.reduceByKey((a, b) -> a + b); +JavaPairRDD counts = textFile +.flatMap(s -> Arrays.asList(s.split(" ")).iterator()) +.mapToPair(word -> new Tuple2<>(word, 1)) +.reduceByKey((a, b) -> a + b); counts.saveAsTextFile("hdfs://..."); {% endhighlight %} http://git-wip-us.apache.org/repos/asf/spark-website/blob/470b7ed5/site/examples.html -- diff --git a/site/examples.html b/site/examples.html index 05ec479..a19e263 100644 --- a/site/examples.html +++ b/site/examples.html @@ -247,9 +247,10 @@ In this page, we will show examples using RDD API as well as examples using high JavaRDDtextFile = sc.textFile("hdfs://..."); -JavaRDD words = textFile.flatMap(s -> Arrays.asList(s.split(" ")).iterator()) -.mapToPair(word -> new Tuple2<>(word, 1)) -.reduceByKey((a, b) -> a + b); +JavaPairRDD counts = textFile +.flatMap(s -> Arrays.asList(s.split(" ")).iterator()) +.mapToPair(word -> new Tuple2<>(word, 1)) +.reduceByKey((a, b) -> a + b); counts.saveAsTextFile("hdfs://..."); - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
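Editor's note: for reference, the corrected Java word-count example produced by this fix, with the generic parameters that the archive formatting drops and the imports it relies on (`sc` is assumed to be a `JavaSparkContext`):

```java
import java.util.Arrays;
import scala.Tuple2;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;

JavaRDD<String> textFile = sc.textFile("hdfs://...");
JavaPairRDD<String, Integer> counts = textFile
    .flatMap(s -> Arrays.asList(s.split(" ")).iterator())
    .mapToPair(word -> new Tuple2<>(word, 1))
    .reduceByKey((a, b) -> a + b);
counts.saveAsTextFile("hdfs://...");
```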
spark git commit: [MINOR][DOCS] Fix few typos in structured streaming doc
Repository: spark Updated Branches: refs/heads/master fa7c582e9 -> 1b9ba258e [MINOR][DOCS] Fix few typos in structured streaming doc ## What changes were proposed in this pull request? Minor typo in `even-time`, which is changed to `event-time` and a couple of grammatical errors fix. ## How was this patch tested? N/A - since this is a doc fix. I did a jekyll build locally though. Author: Ramkumar Venkataraman Closes #17037 from ramkumarvenkat/doc-fix. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1b9ba258 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1b9ba258 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1b9ba258 Branch: refs/heads/master Commit: 1b9ba258e086e2ba89a4f35a54106e2f8a38b525 Parents: fa7c582 Author: Ramkumar Venkataraman Authored: Sat Feb 25 02:18:22 2017 + Committer: Sean Owen Committed: Sat Feb 25 02:18:22 2017 + -- docs/structured-streaming-programming-guide.md | 8 1 file changed, 4 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1b9ba258/docs/structured-streaming-programming-guide.md -- diff --git a/docs/structured-streaming-programming-guide.md b/docs/structured-streaming-programming-guide.md index ad3b2fb..6af47b6 100644 --- a/docs/structured-streaming-programming-guide.md +++ b/docs/structured-streaming-programming-guide.md @@ -392,7 +392,7 @@ data, thus relieving the users from reasoning about it. As an example, letâs see how this model handles event-time based processing and late arriving data. ## Handling Event-time and Late Data -Event-time is the time embedded in the data itself. For many applications, you may want to operate on this event-time. For example, if you want to get the number of events generated by IoT devices every minute, then you probably want to use the time when the data was generated (that is, event-time in the data), rather than the time Spark receives them. This event-time is very naturally expressed in this model -- each event from the devices is a row in the table, and event-time is a column value in the row. This allows window-based aggregations (e.g. number of events every minute) to be just a special type of grouping and aggregation on the even-time column -- each time window is a group and each row can belong to multiple windows/groups. Therefore, such event-time-window-based aggregation queries can be defined consistently on both a static dataset (e.g. from collected device events logs) as well as on a data stream, making the life of the user much easier. +Event-time is the time embedded in the data itself. For many applications, you may want to operate on this event-time. For example, if you want to get the number of events generated by IoT devices every minute, then you probably want to use the time when the data was generated (that is, event-time in the data), rather than the time Spark receives them. This event-time is very naturally expressed in this model -- each event from the devices is a row in the table, and event-time is a column value in the row. This allows window-based aggregations (e.g. number of events every minute) to be just a special type of grouping and aggregation on the event-time column -- each time window is a group and each row can belong to multiple windows/groups. Therefore, such event-time-window-based aggregation queries can be defined consistently on both a static dataset (e.g. from collected device events logs) as well as on a data stream, making the life of the user much easier. 
Furthermore, this model naturally handles data that has arrived later than expected based on its event-time. Since Spark is updating the Result Table, @@ -401,7 +401,7 @@ as well as cleaning up old aggregates to limit the size of intermediate state data. Since Spark 2.1, we have support for watermarking which allows the user to specify the threshold of late data, and allows the engine to accordingly clean up old state. These are explained later in more -details in the [Window Operations](#window-operations-on-event-time) section. +detail in the [Window Operations](#window-operations-on-event-time) section. ## Fault Tolerance Semantics Delivering end-to-end exactly-once semantics was one of key goals behind the design of Structured Streaming. To achieve that, we have designed the Structured Streaming sources, the sinks and the execution engine to reliably track the exact progress of the processing so that it can handle any kind of failure by restarting and/or reprocessing. Every streaming source is assumed to have offsets (similar to Kafka offsets, or Kinesis sequence numbers) @@ -647,7 +647,7 @@ df.groupBy("deviceType").count() #
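To make the event-time and watermarking discussion above concrete, here is a minimal Scala sketch of a windowed count over an event-time column with a late-data watermark. The source format, schema, column names, and input path are assumptions for illustration only and are not part of the patch above.

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.window
import org.apache.spark.sql.types.{StringType, StructType, TimestampType}

object EventTimeWindowSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("EventTimeWindowSketch").getOrCreate()
    import spark.implicits._

    // Hypothetical stream of device events; the schema and input path are assumptions.
    val schema = new StructType()
      .add("eventTime", TimestampType)
      .add("deviceId", StringType)
    val events = spark.readStream.schema(schema).json("/path/to/device/events")

    // Group by a 1-minute window on the event-time column, keeping up to
    // 10 minutes of late data before old state may be dropped (the watermark).
    val counts = events
      .withWatermark("eventTime", "10 minutes")
      .groupBy(window($"eventTime", "1 minute"), $"deviceId")
      .count()

    // Write the running aggregates to the console for demonstration purposes.
    counts.writeStream
      .outputMode("complete")
      .format("console")
      .start()
      .awaitTermination()
  }
}
```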
spark git commit: [SPARK-19673][SQL] "ThriftServer default app name is changed wrong"
Repository: spark Updated Branches: refs/heads/master 061bcfb86 -> fe07de956 [SPARK-19673][SQL] "ThriftServer default app name is changed wrong" ## What changes were proposed in this pull request? In Spark 1.x, the ThriftServer app name was SparkSQL::localHostName, but the default name has since changed to the class name of HiveThriftServer2, which is not appropriate. ## How was this patch tested? manual tests Please review http://spark.apache.org/contributing.html before opening a pull request. Author: lvdongr Closes #17010 from lvdongr/ThriftserverName. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fe07de95 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fe07de95 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fe07de95 Branch: refs/heads/master Commit: fe07de9566b345c7ad6a985bf1fdf1062351f6cf Parents: 061bcfb Author: lvdongr Authored: Sat Feb 25 21:47:02 2017 + Committer: Sean Owen Committed: Sat Feb 25 21:47:02 2017 + -- .../scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala | 1 + 1 file changed, 1 insertion(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fe07de95/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala -- diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala index 78a3094..c0b2994 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala @@ -40,6 +40,7 @@ private[hive] object SparkSQLEnv extends Logging { val maybeAppName = sparkConf .getOption("spark.app.name") .filterNot(_ == classOf[SparkSQLCLIDriver].getName) +.filterNot(_ == classOf[HiveThriftServer2].getName) sparkConf .setAppName(maybeAppName.getOrElse(s"SparkSQL::${Utils.localHostName()}")) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
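The fallback logic that this one-line change extends can be sketched in isolation. The following is an illustrative stand-in, not the actual SparkSQLEnv code: an explicitly configured `spark.app.name` is kept only when it is not one of the entry-point class names; otherwise the `SparkSQL::localHostName` default is used.

```scala
object AppNameFallbackSketch {
  // The entry-point class names filtered out above, listed as plain strings
  // so this sketch has no dependency on the thriftserver module.
  private val entryPointClassNames = Set(
    "org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver",
    "org.apache.spark.sql.hive.thriftserver.HiveThriftServer2")

  /** Keep a user-chosen app name; ignore names that are just an entry-point class. */
  def resolveAppName(configured: Option[String], localHostName: String): String =
    configured
      .filterNot(entryPointClassNames.contains)
      .getOrElse(s"SparkSQL::$localHostName")

  def main(args: Array[String]): Unit = {
    // A name injected by the launcher is replaced by the friendly default...
    println(resolveAppName(
      Some("org.apache.spark.sql.hive.thriftserver.HiveThriftServer2"), "host1"))
    // ...while a genuine user-provided name is preserved.
    println(resolveAppName(Some("my-sql-app"), "host1"))
  }
}
```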
spark git commit: [SPARK-15288][MESOS] Mesos dispatcher should handle gracefully when any thread gets UncaughtException
Repository: spark Updated Branches: refs/heads/master fe07de956 -> 410392ed7 [SPARK-15288][MESOS] Mesos dispatcher should handle gracefully when any thread gets UncaughtException ## What changes were proposed in this pull request? Adding the default UncaughtExceptionHandler to the MesosClusterDispatcher. ## How was this patch tested? Verified manually: when any of the dispatcher threads throws an uncaught exception, the default UncaughtExceptionHandler now handles it. Author: Devaraj K Closes #13072 from devaraj-kavali/SPARK-15288. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/410392ed Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/410392ed Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/410392ed Branch: refs/heads/master Commit: 410392ed75da64c6980fad5b450b352ee8377cb8 Parents: fe07de9 Author: Devaraj K Authored: Sat Feb 25 21:48:41 2017 + Committer: Sean Owen Committed: Sat Feb 25 21:48:41 2017 + -- .../org/apache/spark/deploy/mesos/MesosClusterDispatcher.scala| 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/410392ed/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcher.scala -- diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcher.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcher.scala index 792ade8..38b082a 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcher.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcher.scala @@ -25,7 +25,7 @@ import org.apache.spark.deploy.mesos.ui.MesosClusterUI import org.apache.spark.deploy.rest.mesos.MesosRestServer import org.apache.spark.internal.Logging import org.apache.spark.scheduler.cluster.mesos._ -import org.apache.spark.util.{CommandLineUtils, ShutdownHookManager, Utils} +import org.apache.spark.util.{CommandLineUtils, ShutdownHookManager, SparkUncaughtExceptionHandler, Utils} /* * A dispatcher that is responsible for managing and launching drivers, and is intended to be @@ -97,6 +97,7 @@ private[mesos] object MesosClusterDispatcher with CommandLineUtils { override def main(args: Array[String]) { +Thread.setDefaultUncaughtExceptionHandler(SparkUncaughtExceptionHandler) Utils.initDaemon(log) val conf = new SparkConf val dispatcherArgs = new MesosClusterDispatcherArguments(args, conf) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
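The mechanism being wired in above is the standard JVM default uncaught-exception hook. A minimal, self-contained Scala sketch of that pattern (not the actual SparkUncaughtExceptionHandler implementation) looks like this:

```scala
// Install a default handler so that any thread dying with an uncaught exception
// is reported (and the process can choose to exit) instead of failing silently.
object LoggingExceptionHandler extends Thread.UncaughtExceptionHandler {
  override def uncaughtException(t: Thread, e: Throwable): Unit = {
    System.err.println(s"Uncaught exception in thread ${t.getName}: $e")
    // A long-running daemon might call System.exit(...) here for fatal errors.
  }
}

object UncaughtExceptionSketch {
  def main(args: Array[String]): Unit = {
    Thread.setDefaultUncaughtExceptionHandler(LoggingExceptionHandler)
    // This failure is now reported by LoggingExceptionHandler rather than lost.
    new Thread(new Runnable {
      override def run(): Unit = throw new IllegalStateException("boom")
    }).start()
  }
}
```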
spark git commit: [MINOR][BUILD] Fix lint-java breaks in Java
Repository: spark Updated Branches: refs/heads/master 9f8e39215 -> 4ba9c6c45 [MINOR][BUILD] Fix lint-java breaks in Java ## What changes were proposed in this pull request? This PR proposes to fix the lint-breaks as below: ``` [ERROR] src/test/java/org/apache/spark/network/TransportResponseHandlerSuite.java:[29,8] (imports) UnusedImports: Unused import - org.apache.spark.network.buffer.ManagedBuffer. [ERROR] src/main/java/org/apache/spark/unsafe/types/UTF8String.java:[156,10] (modifier) ModifierOrder: 'Nonnull' annotation modifier does not precede non-annotation modifiers. [ERROR] src/main/java/org/apache/spark/SparkFirehoseListener.java:[122] (sizes) LineLength: Line is longer than 100 characters (found 105). [ERROR] src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java:[164,78] (coding) OneStatementPerLine: Only one statement per line allowed. [ERROR] src/test/java/test/org/apache/spark/JavaAPISuite.java:[1157] (sizes) LineLength: Line is longer than 100 characters (found 121). [ERROR] src/test/java/org/apache/spark/streaming/JavaMapWithStateSuite.java:[149] (sizes) LineLength: Line is longer than 100 characters (found 113). [ERROR] src/test/java/test/org/apache/spark/streaming/Java8APISuite.java:[146] (sizes) LineLength: Line is longer than 100 characters (found 122). [ERROR] src/test/java/test/org/apache/spark/streaming/JavaAPISuite.java:[32,8] (imports) UnusedImports: Unused import - org.apache.spark.streaming.Time. [ERROR] src/test/java/test/org/apache/spark/streaming/JavaAPISuite.java:[611] (sizes) LineLength: Line is longer than 100 characters (found 101). [ERROR] src/test/java/test/org/apache/spark/streaming/JavaAPISuite.java:[1317] (sizes) LineLength: Line is longer than 100 characters (found 102). [ERROR] src/test/java/test/org/apache/spark/sql/JavaDatasetAggregatorSuite.java:[91] (sizes) LineLength: Line is longer than 100 characters (found 102). [ERROR] src/test/java/test/org/apache/spark/sql/JavaDatasetSuite.java:[113] (sizes) LineLength: Line is longer than 100 characters (found 101). [ERROR] src/test/java/test/org/apache/spark/sql/JavaDatasetSuite.java:[164] (sizes) LineLength: Line is longer than 100 characters (found 110). [ERROR] src/test/java/test/org/apache/spark/sql/JavaDatasetSuite.java:[212] (sizes) LineLength: Line is longer than 100 characters (found 114). [ERROR] src/test/java/org/apache/spark/mllib/tree/JavaDecisionTreeSuite.java:[36] (sizes) LineLength: Line is longer than 100 characters (found 101). [ERROR] src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCountASL.java:[26,8] (imports) UnusedImports: Unused import - com.amazonaws.regions.RegionUtils. [ERROR] src/test/java/org/apache/spark/streaming/kinesis/JavaKinesisStreamSuite.java:[20,8] (imports) UnusedImports: Unused import - com.amazonaws.regions.RegionUtils. [ERROR] src/test/java/org/apache/spark/streaming/kinesis/JavaKinesisStreamSuite.java:[94] (sizes) LineLength: Line is longer than 100 characters (found 103). [ERROR] src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java:[30,8] (imports) UnusedImports: Unused import - org.apache.spark.sql.api.java.UDF1. [ERROR] src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java:[72] (sizes) LineLength: Line is longer than 100 characters (found 104). [ERROR] src/main/java/org/apache/spark/examples/mllib/JavaRankingMetricsExample.java:[121] (sizes) LineLength: Line is longer than 100 characters (found 101). 
[ERROR] src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java:[28,8] (imports) UnusedImports: Unused import - org.apache.spark.api.java.JavaRDD. [ERROR] src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java:[29,8] (imports) UnusedImports: Unused import - org.apache.spark.api.java.JavaSparkContext. ``` ## How was this patch tested? Manually via ```bash ./dev/lint-java ``` Author: hyukjinkwon Closes #17072 from HyukjinKwon/java-lint. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4ba9c6c4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4ba9c6c4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4ba9c6c4 Branch: refs/heads/master Commit: 4ba9c6c453606f5e5a1e324d5f933d2c9307a604 Parents: 9f8e392 Author: hyukjinkwon Authored: Mon Feb 27 08:44:26 2017 + Committer: Sean Owen Committed: Mon Feb 27 08:44:26 2017 + -- .../network/TransportResponseHandlerSuite.java | 1 - .../apache/spark/unsafe/types/UTF8String.java | 3 +- .../org/apache/spark/SparkFirehoseListener.java | 225 ++- .../unsafe/sort/UnsafeExternalSorter.java | 4 +- .../test/org/apache/spark/JavaAPISuite.java | 109 - .../spark/examples/ml/JavaTokenizerExample.java | 4 +- .../mllib/JavaRankingMetricsExample.java| 4 +- .../streaming/JavaKi
[1/2] spark git commit: [SPARK-19660][CORE][SQL] Replace the configuration property names that are deprecated in the version of Hadoop 2.6
Repository: spark Updated Branches: refs/heads/master a350bc16d -> 9b8eca65d http://git-wip-us.apache.org/repos/asf/spark/blob/9b8eca65/sql/hive/src/test/resources/golden/merge2-3-10266e3d5dd4c841c0d65030b1edba7c -- diff --git a/sql/hive/src/test/resources/golden/merge2-3-10266e3d5dd4c841c0d65030b1edba7c b/sql/hive/src/test/resources/golden/merge2-3-10266e3d5dd4c841c0d65030b1edba7c new file mode 100644 index 000..573541a --- /dev/null +++ b/sql/hive/src/test/resources/golden/merge2-3-10266e3d5dd4c841c0d65030b1edba7c @@ -0,0 +1 @@ +0 http://git-wip-us.apache.org/repos/asf/spark/blob/9b8eca65/sql/hive/src/test/resources/golden/merge2-3-6e53a3ac93113f20db3a12f1dcf30e86 -- diff --git a/sql/hive/src/test/resources/golden/merge2-3-6e53a3ac93113f20db3a12f1dcf30e86 b/sql/hive/src/test/resources/golden/merge2-3-6e53a3ac93113f20db3a12f1dcf30e86 deleted file mode 100644 index 573541a..000 --- a/sql/hive/src/test/resources/golden/merge2-3-6e53a3ac93113f20db3a12f1dcf30e86 +++ /dev/null @@ -1 +0,0 @@ -0 http://git-wip-us.apache.org/repos/asf/spark/blob/9b8eca65/sql/hive/src/test/resources/golden/merge2-4-84967075baa3e56fff2a23f8ab9ba076 -- diff --git a/sql/hive/src/test/resources/golden/merge2-4-84967075baa3e56fff2a23f8ab9ba076 b/sql/hive/src/test/resources/golden/merge2-4-84967075baa3e56fff2a23f8ab9ba076 deleted file mode 100644 index 573541a..000 --- a/sql/hive/src/test/resources/golden/merge2-4-84967075baa3e56fff2a23f8ab9ba076 +++ /dev/null @@ -1 +0,0 @@ -0 http://git-wip-us.apache.org/repos/asf/spark/blob/9b8eca65/sql/hive/src/test/resources/golden/merge2-4-9cbd6d400fb6c3cd09010e3dbd76601 -- diff --git a/sql/hive/src/test/resources/golden/merge2-4-9cbd6d400fb6c3cd09010e3dbd76601 b/sql/hive/src/test/resources/golden/merge2-4-9cbd6d400fb6c3cd09010e3dbd76601 new file mode 100644 index 000..573541a --- /dev/null +++ b/sql/hive/src/test/resources/golden/merge2-4-9cbd6d400fb6c3cd09010e3dbd76601 @@ -0,0 +1 @@ +0 http://git-wip-us.apache.org/repos/asf/spark/blob/9b8eca65/sql/hive/src/test/resources/golden/merge2-5-1ba2d6f3bb3348da3fee7fab4f283f34 -- diff --git a/sql/hive/src/test/resources/golden/merge2-5-1ba2d6f3bb3348da3fee7fab4f283f34 b/sql/hive/src/test/resources/golden/merge2-5-1ba2d6f3bb3348da3fee7fab4f283f34 new file mode 100644 index 000..573541a --- /dev/null +++ b/sql/hive/src/test/resources/golden/merge2-5-1ba2d6f3bb3348da3fee7fab4f283f34 @@ -0,0 +1 @@ +0 http://git-wip-us.apache.org/repos/asf/spark/blob/9b8eca65/sql/hive/src/test/resources/golden/merge2-5-2ee5d706fe3a3bcc38b795f6e94970ea -- diff --git a/sql/hive/src/test/resources/golden/merge2-5-2ee5d706fe3a3bcc38b795f6e94970ea b/sql/hive/src/test/resources/golden/merge2-5-2ee5d706fe3a3bcc38b795f6e94970ea deleted file mode 100644 index 573541a..000 --- a/sql/hive/src/test/resources/golden/merge2-5-2ee5d706fe3a3bcc38b795f6e94970ea +++ /dev/null @@ -1 +0,0 @@ -0 http://git-wip-us.apache.org/repos/asf/spark/blob/9b8eca65/sql/hive/src/test/resources/golden/parallel-0-23a4feaede17467a8cc26e4d86ec30f9 -- diff --git a/sql/hive/src/test/resources/golden/parallel-0-23a4feaede17467a8cc26e4d86ec30f9 b/sql/hive/src/test/resources/golden/parallel-0-23a4feaede17467a8cc26e4d86ec30f9 deleted file mode 100644 index 573541a..000 --- a/sql/hive/src/test/resources/golden/parallel-0-23a4feaede17467a8cc26e4d86ec30f9 +++ /dev/null @@ -1 +0,0 @@ -0 http://git-wip-us.apache.org/repos/asf/spark/blob/9b8eca65/sql/hive/src/test/resources/golden/parallel-0-6dc30e2de057022e63bd2a645fbec4c2 -- diff --git 
a/sql/hive/src/test/resources/golden/parallel-0-6dc30e2de057022e63bd2a645fbec4c2 b/sql/hive/src/test/resources/golden/parallel-0-6dc30e2de057022e63bd2a645fbec4c2 new file mode 100644 index 000..573541a --- /dev/null +++ b/sql/hive/src/test/resources/golden/parallel-0-6dc30e2de057022e63bd2a645fbec4c2 @@ -0,0 +1 @@ +0 http://git-wip-us.apache.org/repos/asf/spark/blob/9b8eca65/sql/hive/src/test/resources/golden/rcfile_lazydecompress-11-25715870c569b0f8c3d483e3a38b3199 -- diff --git a/sql/hive/src/test/resources/golden/rcfile_lazydecompress-11-25715870c569b0f8c3d483e3a38b3199 b/sql/hive/src/test/resources/golden/rcfile_lazydecompress-11-25715870c569b0f8c3d483e3a38b3199 new file mode 100644 index 000..573541a --- /dev/null +++ b/sql/hive/src/test/resources/golden/rcfile_lazydecompress-11-25715870c569b0f8c3d483e3a38b3199
[2/2] spark git commit: [SPARK-19660][CORE][SQL] Replace the configuration property names that are deprecated in the version of Hadoop 2.6
[SPARK-19660][CORE][SQL] Replace the configuration property names that are deprecated in the version of Hadoop 2.6 ## What changes were proposed in this pull request? Replace all the Hadoop deprecated configuration property names according to [DeprecatedProperties](https://hadoop.apache.org/docs/r2.6.0/hadoop-project-dist/hadoop-common/DeprecatedProperties.html). except: https://github.com/apache/spark/blob/v2.1.0/python/pyspark/sql/tests.py#L1533 https://github.com/apache/spark/blob/v2.1.0/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala#L987 https://github.com/apache/spark/blob/v2.1.0/sql/core/src/main/scala/org/apache/spark/sql/execution/command/SetCommand.scala#L45 https://github.com/apache/spark/blob/v2.1.0/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala#L614 ## How was this patch tested? Existing tests Author: Yuming Wang Closes #16990 from wangyum/HadoopDeprecatedProperties. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9b8eca65 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9b8eca65 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9b8eca65 Branch: refs/heads/master Commit: 9b8eca65dcf68129470ead39362ce870ffb0bb1d Parents: a350bc1 Author: Yuming Wang Authored: Tue Feb 28 10:13:42 2017 + Committer: Sean Owen Committed: Tue Feb 28 10:13:42 2017 + -- R/WINDOWS.md| 2 +- R/run-tests.sh | 2 +- appveyor.yml| 2 +- .../io/HadoopMapReduceCommitProtocol.scala | 10 ++--- .../io/SparkHadoopMapReduceWriter.scala | 2 +- .../scala/org/apache/spark/rdd/HadoopRDD.scala | 10 ++--- .../org/apache/spark/rdd/PairRDDFunctions.scala | 9 ++-- .../test/scala/org/apache/spark/FileSuite.scala | 5 ++- docs/hardware-provisioning.md | 4 +- python/pyspark/tests.py | 47 ++-- .../spark/sql/execution/command/tables.scala| 4 +- .../datasources/FileFormatWriter.scala | 10 ++--- .../HiveWindowFunctionQuerySuite.scala | 8 ++-- .../org/apache/spark/sql/hive/TableReader.scala | 8 ++-- .../hive/execution/InsertIntoHiveTable.scala| 15 --- .../apache/spark/sql/hive/test/TestHive.scala | 2 +- ...4_hadoop20-2-2b9ccaa793eae0e73bf76335d3d6880 | 1 + ..._hadoop20-2-db1cd54a4cb36de2087605f32e41824f | 1 - .../combine1-2-6142f47d3fcdd4323162014d5eb35e07 | 1 + .../combine1-2-c95dc367df88c9e5cf77157f29ba2daf | 1 - .../combine1-3-10266e3d5dd4c841c0d65030b1edba7c | 1 + .../combine1-3-6e53a3ac93113f20db3a12f1dcf30e86 | 1 - .../combine1-4-84967075baa3e56fff2a23f8ab9ba076 | 1 - .../combine1-4-9cbd6d400fb6c3cd09010e3dbd76601 | 1 + .../combine1-5-1ba2d6f3bb3348da3fee7fab4f283f34 | 1 + .../combine1-5-2ee5d706fe3a3bcc38b795f6e94970ea | 1 - .../combine2-2-6142f47d3fcdd4323162014d5eb35e07 | 1 + .../combine2-2-c95dc367df88c9e5cf77157f29ba2daf | 1 - .../combine2-3-10266e3d5dd4c841c0d65030b1edba7c | 1 + .../combine2-3-6e53a3ac93113f20db3a12f1dcf30e86 | 1 - .../combine2-4-84967075baa3e56fff2a23f8ab9ba076 | 1 - .../combine2-4-9cbd6d400fb6c3cd09010e3dbd76601 | 1 + .../combine2-5-1ba2d6f3bb3348da3fee7fab4f283f34 | 1 + .../combine2-5-2ee5d706fe3a3bcc38b795f6e94970ea | 1 - .../groupby1-3-c8478dac3497697b4375ee35118a5c3e | 1 + .../groupby1-3-d57ed4bbfee1ffaffaeba0a4be84c31d | 1 - .../groupby1-5-c9cee6382b64bd3d71177527961b8be2 | 1 + .../groupby1-5-dd7bf298b8c921355edd8665c6b0c168 | 1 - ...by1_limit-0-83c59d378571a6e487aa20217bd87817 | 1 - ...by1_limit-0-be2c0b32a02a1154bfdee1a52530f387 | 1 + ...upby1_map-2-83c59d378571a6e487aa20217bd87817 | 1 - ...upby1_map-2-be2c0b32a02a1154bfdee1a52530f387 | 1 + 
..._map_skew-2-83c59d378571a6e487aa20217bd87817 | 1 - ..._map_skew-2-be2c0b32a02a1154bfdee1a52530f387 | 1 + ...y1_noskew-2-83c59d378571a6e487aa20217bd87817 | 1 - ...y1_noskew-2-be2c0b32a02a1154bfdee1a52530f387 | 1 + ...by2_limit-0-83c59d378571a6e487aa20217bd87817 | 1 - ...by2_limit-0-be2c0b32a02a1154bfdee1a52530f387 | 1 + ...upby2_map-2-83c59d378571a6e487aa20217bd87817 | 1 - ...upby2_map-2-be2c0b32a02a1154bfdee1a52530f387 | 1 + ..._map_skew-2-83c59d378571a6e487aa20217bd87817 | 1 - ..._map_skew-2-be2c0b32a02a1154bfdee1a52530f387 | 1 + ...y2_noskew-2-83c59d378571a6e487aa20217bd87817 | 1 - ...y2_noskew-2-be2c0b32a02a1154bfdee1a52530f387 | 1 + ...upby4_map-2-83c59d378571a6e487aa20217bd87817 | 1 - ...upby4_map-2-be2c0b32a02a1154bfdee1a52530f387 | 1 + ..._map_skew-2-83c59d378571a6e487aa20217bd87817 | 1 - ..._map_skew-2-be2c0b32a02a1154bfdee1a52530f387 | 1 + ...y4_noskew-2-83c59d378571a6e487aa20217bd87817 | 1 - ...y4_noskew-2-be2c0b32a02a1154bfdee1a52530f387 | 1 + ...upby5_map-2-83c59d378571a6e487aa20217bd87817 | 1 - ...upby5_map-2-be2c0b32a02a1154bfde
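For readers maintaining similar code, the substance of the change above is simply preferring the current Hadoop 2.x property names over their deprecated Hadoop 1.x equivalents. A small sketch follows; the property pairs are taken from the DeprecatedProperties table linked in the commit message, and the values are arbitrary placeholders.

```scala
import org.apache.spark.SparkContext

object HadoopPropertyNamesSketch {
  // Prefer the current key names; the deprecated equivalents are noted in comments.
  def configure(sc: SparkContext): Unit = {
    val hadoopConf = sc.hadoopConfiguration
    // was: mapred.reduce.tasks
    hadoopConf.setInt("mapreduce.job.reduces", 8)
    // was: mapred.output.compress
    hadoopConf.setBoolean("mapreduce.output.fileoutputformat.compress", true)
    // was: mapred.job.name
    hadoopConf.set("mapreduce.job.name", "my-job")
  }
}
```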
spark git commit: [SPARK-19769][DOCS] Update quickstart instructions
Repository: spark Updated Branches: refs/heads/branch-2.1 947c0cd90 -> d887f7581 [SPARK-19769][DOCS] Update quickstart instructions ## What changes were proposed in this pull request? This change addresses the renaming of the `simple.sbt` build file to `build.sbt`. Newer versions of the sbt tool are not finding the older named file and are looking for the `build.sbt`. The quickstart instructions for self-contained applications is updated with this change. ## How was this patch tested? As this is a relatively minor change of a few words, the markdown was checked for syntax and spelling. Site was built with `SKIP_API=1 jekyll serve` for testing purposes. Author: Michael McCune Closes #17101 from elmiko/spark-19769. (cherry picked from commit bf5987cbe6c9f4a1a91d912ed3a9098111632d1a) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d887f758 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d887f758 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d887f758 Branch: refs/heads/branch-2.1 Commit: d887f758152be4d6e089066a97b1eab817d3be83 Parents: 947c0cd Author: Michael McCune Authored: Wed Mar 1 00:07:16 2017 +0100 Committer: Sean Owen Committed: Wed Mar 1 00:07:26 2017 +0100 -- docs/quick-start.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d887f758/docs/quick-start.md -- diff --git a/docs/quick-start.md b/docs/quick-start.md index 0836c60..478bdcf 100644 --- a/docs/quick-start.md +++ b/docs/quick-start.md @@ -260,7 +260,7 @@ object which contains information about our application. Our application depends on the Spark API, so we'll also include an sbt configuration file, -`simple.sbt`, which explains that Spark is a dependency. This file also adds a repository that +`build.sbt`, which explains that Spark is a dependency. This file also adds a repository that Spark depends on: {% highlight scala %} @@ -273,7 +273,7 @@ scalaVersion := "{{site.SCALA_VERSION}}" libraryDependencies += "org.apache.spark" %% "spark-core" % "{{site.SPARK_VERSION}}" {% endhighlight %} -For sbt to work correctly, we'll need to layout `SimpleApp.scala` and `simple.sbt` +For sbt to work correctly, we'll need to layout `SimpleApp.scala` and `build.sbt` according to the typical directory structure. Once that is in place, we can create a JAR package containing the application's code, then use the `spark-submit` script to run our program. @@ -281,7 +281,7 @@ containing the application's code, then use the `spark-submit` script to run our # Your directory layout should look like this $ find . . -./simple.sbt +./build.sbt ./src ./src/main ./src/main/scala - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
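For reference, a minimal `build.sbt` matching the renamed file described above might look like the sketch below; the Scala and Spark version numbers are placeholders to be replaced with the versions you actually build against.

```scala
// build.sbt — minimal sketch; the version numbers are placeholders.
name := "Simple Project"

version := "1.0"

scalaVersion := "2.11.8"

libraryDependencies += "org.apache.spark" %% "spark-core" % "2.1.0"
```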
spark git commit: [SPARK-19769][DOCS] Update quickstart instructions
Repository: spark Updated Branches: refs/heads/branch-2.0 dcfb05c86 -> c9c45d97b [SPARK-19769][DOCS] Update quickstart instructions ## What changes were proposed in this pull request? This change addresses the renaming of the `simple.sbt` build file to `build.sbt`. Newer versions of the sbt tool are not finding the older named file and are looking for the `build.sbt`. The quickstart instructions for self-contained applications is updated with this change. ## How was this patch tested? As this is a relatively minor change of a few words, the markdown was checked for syntax and spelling. Site was built with `SKIP_API=1 jekyll serve` for testing purposes. Author: Michael McCune Closes #17101 from elmiko/spark-19769. (cherry picked from commit bf5987cbe6c9f4a1a91d912ed3a9098111632d1a) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c9c45d97 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c9c45d97 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c9c45d97 Branch: refs/heads/branch-2.0 Commit: c9c45d97bd98049b6fd32a4194e8aaff526c034d Parents: dcfb05c Author: Michael McCune Authored: Wed Mar 1 00:07:16 2017 +0100 Committer: Sean Owen Committed: Wed Mar 1 00:07:41 2017 +0100 -- docs/quick-start.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c9c45d97/docs/quick-start.md -- diff --git a/docs/quick-start.md b/docs/quick-start.md index c67b010..70cbccf 100644 --- a/docs/quick-start.md +++ b/docs/quick-start.md @@ -260,7 +260,7 @@ object which contains information about our application. Our application depends on the Spark API, so we'll also include an sbt configuration file, -`simple.sbt`, which explains that Spark is a dependency. This file also adds a repository that +`build.sbt`, which explains that Spark is a dependency. This file also adds a repository that Spark depends on: {% highlight scala %} @@ -273,7 +273,7 @@ scalaVersion := "{{site.SCALA_VERSION}}" libraryDependencies += "org.apache.spark" %% "spark-core" % "{{site.SPARK_VERSION}}" {% endhighlight %} -For sbt to work correctly, we'll need to layout `SimpleApp.scala` and `simple.sbt` +For sbt to work correctly, we'll need to layout `SimpleApp.scala` and `build.sbt` according to the typical directory structure. Once that is in place, we can create a JAR package containing the application's code, then use the `spark-submit` script to run our program. @@ -281,7 +281,7 @@ containing the application's code, then use the `spark-submit` script to run our # Your directory layout should look like this $ find . . -./simple.sbt +./build.sbt ./src ./src/main ./src/main/scala - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-19373][MESOS] Base spark.scheduler.minRegisteredResourceRatio on registered cores rather than accepted cores
Repository: spark Updated Branches: refs/heads/master bf5987cbe -> ca3864d6e [SPARK-19373][MESOS] Base spark.scheduler.minRegisteredResourceRatio on registered cores rather than accepted cores ## What changes were proposed in this pull request? See JIRA ## How was this patch tested? Unit tests, Mesos/Spark integration tests cc skonto susanxhuynh Author: Michael Gummelt Closes #17045 from mgummelt/SPARK-19373-registered-resources. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ca3864d6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ca3864d6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ca3864d6 Branch: refs/heads/master Commit: ca3864d6e090ca3e68a2ef0cf527e6e00c8c4f64 Parents: bf5987c Author: Michael Gummelt Authored: Wed Mar 1 00:10:55 2017 +0100 Committer: Sean Owen Committed: Wed Mar 1 00:10:55 2017 +0100 -- .../MesosCoarseGrainedSchedulerBackend.scala| 27 +++-- ...esosCoarseGrainedSchedulerBackendSuite.scala | 111 +-- 2 files changed, 70 insertions(+), 68 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ca3864d6/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala -- diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala index f555072..f69c223 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala @@ -54,14 +54,17 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( with org.apache.mesos.Scheduler with MesosSchedulerUtils { - val MAX_SLAVE_FAILURES = 2 // Blacklist a slave after this many failures + // Blacklist a slave after this many failures + private val MAX_SLAVE_FAILURES = 2 - // Maximum number of cores to acquire (TODO: we'll need more flexible controls here) - val maxCores = conf.get("spark.cores.max", Int.MaxValue.toString).toInt + private val maxCoresOption = conf.getOption("spark.cores.max").map(_.toInt) - val useFetcherCache = conf.getBoolean("spark.mesos.fetcherCache.enable", false) + // Maximum number of cores to acquire + private val maxCores = maxCoresOption.getOrElse(Int.MaxValue) - val maxGpus = conf.getInt("spark.mesos.gpus.max", 0) + private val useFetcherCache = conf.getBoolean("spark.mesos.fetcherCache.enable", false) + + private val maxGpus = conf.getInt("spark.mesos.gpus.max", 0) private[this] val shutdownTimeoutMS = conf.getTimeAsMs("spark.mesos.coarse.shutdownTimeout", "10s") @@ -75,10 +78,10 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( private val shuffleServiceEnabled = conf.getBoolean("spark.shuffle.service.enabled", false) // Cores we have acquired with each Mesos task ID - val coresByTaskId = new mutable.HashMap[String, Int] - val gpusByTaskId = new mutable.HashMap[String, Int] - var totalCoresAcquired = 0 - var totalGpusAcquired = 0 + private val coresByTaskId = new mutable.HashMap[String, Int] + private val gpusByTaskId = new mutable.HashMap[String, Int] + private var totalCoresAcquired = 0 + private var totalGpusAcquired = 0 // SlaveID -> Slave // This map accumulates entries for the duration of the job. 
Slaves are never deleted, because @@ -108,7 +111,7 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( // may lead to deadlocks since the superclass might also try to lock private val stateLock = new ReentrantLock - val extraCoresPerExecutor = conf.getInt("spark.mesos.extra.cores", 0) + private val extraCoresPerExecutor = conf.getInt("spark.mesos.extra.cores", 0) // Offer constraints private val slaveOfferConstraints = @@ -139,7 +142,7 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( securityManager.isAuthenticationEnabled()) } - var nextMesosTaskId = 0 + private var nextMesosTaskId = 0 @volatile var appId: String = _ @@ -256,7 +259,7 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( } override def sufficientResourcesRegistered(): Boolean = { -totalCoresAcquired >= maxCores * minRegisteredRatio +totalCoreCount.get >= maxCoresOption.getOrElse(0) * minRegisteredRatio } override def disconnected(d: org.apache.mesos.SchedulerDriver) {} http://git-wip-us.apache.org/repos/asf/spark/blob/ca3864d6/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/meso
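Stripped of the surrounding backend code, the readiness check changed by this patch reduces to the comparison sketched below: the scheduler is considered ready once the cores actually registered (rather than merely accepted in offers) reach the configured fraction of the requested maximum. This is an illustrative reduction, not the backend's real method.

```scala
object SufficientResourcesSketch {
  /** True once registered cores reach minRegisteredRatio of the requested maximum. */
  def sufficientResourcesRegistered(
      registeredCores: Int,
      maxCoresOption: Option[Int],
      minRegisteredRatio: Double): Boolean =
    registeredCores >= maxCoresOption.getOrElse(0) * minRegisteredRatio

  def main(args: Array[String]): Unit = {
    // With spark.cores.max = 10 and a ratio of 0.8, 8 registered cores suffice.
    println(sufficientResourcesRegistered(8, Some(10), 0.8))  // true
    println(sufficientResourcesRegistered(4, Some(10), 0.8))  // false
  }
}
```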
spark git commit: [SPARK-19769][DOCS] Update quickstart instructions
Repository: spark Updated Branches: refs/heads/master d743ea4c7 -> bf5987cbe [SPARK-19769][DOCS] Update quickstart instructions ## What changes were proposed in this pull request? This change addresses the renaming of the `simple.sbt` build file to `build.sbt`. Newer versions of the sbt tool are not finding the older named file and are looking for the `build.sbt`. The quickstart instructions for self-contained applications is updated with this change. ## How was this patch tested? As this is a relatively minor change of a few words, the markdown was checked for syntax and spelling. Site was built with `SKIP_API=1 jekyll serve` for testing purposes. Author: Michael McCune Closes #17101 from elmiko/spark-19769. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bf5987cb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bf5987cb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bf5987cb Branch: refs/heads/master Commit: bf5987cbe6c9f4a1a91d912ed3a9098111632d1a Parents: d743ea4 Author: Michael McCune Authored: Wed Mar 1 00:07:16 2017 +0100 Committer: Sean Owen Committed: Wed Mar 1 00:07:16 2017 +0100 -- docs/quick-start.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bf5987cb/docs/quick-start.md -- diff --git a/docs/quick-start.md b/docs/quick-start.md index 04ac278..aa4319a 100644 --- a/docs/quick-start.md +++ b/docs/quick-start.md @@ -260,7 +260,7 @@ object which contains information about our application. Our application depends on the Spark API, so we'll also include an sbt configuration file, -`simple.sbt`, which explains that Spark is a dependency. This file also adds a repository that +`build.sbt`, which explains that Spark is a dependency. This file also adds a repository that Spark depends on: {% highlight scala %} @@ -273,7 +273,7 @@ scalaVersion := "{{site.SCALA_VERSION}}" libraryDependencies += "org.apache.spark" %% "spark-core" % "{{site.SPARK_VERSION}}" {% endhighlight %} -For sbt to work correctly, we'll need to layout `SimpleApp.scala` and `simple.sbt` +For sbt to work correctly, we'll need to layout `SimpleApp.scala` and `build.sbt` according to the typical directory structure. Once that is in place, we can create a JAR package containing the application's code, then use the `spark-submit` script to run our program. @@ -281,7 +281,7 @@ containing the application's code, then use the `spark-submit` script to run our # Your directory layout should look like this $ find . . -./simple.sbt +./build.sbt ./src ./src/main ./src/main/scala - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-19373][MESOS] Base spark.scheduler.minRegisteredResourceRatio …
Repository: spark Updated Branches: refs/heads/branch-2.1 bbe0d8caa -> 27347b5f2 [SPARK-19373][MESOS] Base spark.scheduler.minRegisteredResourceRatio ⦠â¦on registered cores rather than accepted cores See JIRA Unit tests, Mesos/Spark integration tests cc skonto susanxhuynh Author: Michael Gummelt Closes #17045 from mgummelt/SPARK-19373-registered-resources. ## What changes were proposed in this pull request? (Please fill in changes proposed in this fix) ## How was this patch tested? (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) (If this patch involves UI changes, please attach a screenshot; otherwise, remove this) Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Michael Gummelt Closes #17129 from mgummelt/SPARK-19373-registered-resources-2.1. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/27347b5f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/27347b5f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/27347b5f Branch: refs/heads/branch-2.1 Commit: 27347b5f26f668783d8ded89149a5e761b67f786 Parents: bbe0d8c Author: Michael Gummelt Authored: Thu Mar 2 00:32:32 2017 +0100 Committer: Sean Owen Committed: Thu Mar 2 00:32:32 2017 +0100 -- .../MesosCoarseGrainedSchedulerBackend.scala| 27 +++-- ...esosCoarseGrainedSchedulerBackendSuite.scala | 111 +-- 2 files changed, 70 insertions(+), 68 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/27347b5f/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala -- diff --git a/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala b/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala index 5063c1f..22df2b1 100644 --- a/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala +++ b/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala @@ -54,14 +54,17 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( with org.apache.mesos.Scheduler with MesosSchedulerUtils { - val MAX_SLAVE_FAILURES = 2 // Blacklist a slave after this many failures + // Blacklist a slave after this many failures + private val MAX_SLAVE_FAILURES = 2 - // Maximum number of cores to acquire (TODO: we'll need more flexible controls here) - val maxCores = conf.get("spark.cores.max", Int.MaxValue.toString).toInt + private val maxCoresOption = conf.getOption("spark.cores.max").map(_.toInt) - val useFetcherCache = conf.getBoolean("spark.mesos.fetcherCache.enable", false) + // Maximum number of cores to acquire + private val maxCores = maxCoresOption.getOrElse(Int.MaxValue) - val maxGpus = conf.getInt("spark.mesos.gpus.max", 0) + private val useFetcherCache = conf.getBoolean("spark.mesos.fetcherCache.enable", false) + + private val maxGpus = conf.getInt("spark.mesos.gpus.max", 0) private[this] val shutdownTimeoutMS = conf.getTimeAsMs("spark.mesos.coarse.shutdownTimeout", "10s") @@ -75,10 +78,10 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( private val shuffleServiceEnabled = conf.getBoolean("spark.shuffle.service.enabled", false) // Cores we have acquired with each Mesos task ID - val coresByTaskId = new mutable.HashMap[String, Int] - val gpusByTaskId = new mutable.HashMap[String, Int] - var totalCoresAcquired = 0 - var totalGpusAcquired = 0 + private 
val coresByTaskId = new mutable.HashMap[String, Int] + private val gpusByTaskId = new mutable.HashMap[String, Int] + private var totalCoresAcquired = 0 + private var totalGpusAcquired = 0 // SlaveID -> Slave // This map accumulates entries for the duration of the job. Slaves are never deleted, because @@ -108,7 +111,7 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( // may lead to deadlocks since the superclass might also try to lock private val stateLock = new ReentrantLock - val extraCoresPerExecutor = conf.getInt("spark.mesos.extra.cores", 0) + private val extraCoresPerExecutor = conf.getInt("spark.mesos.extra.cores", 0) // Offer constraints private val slaveOfferConstraints = @@ -140,7 +143,7 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( securityManager.isSaslEncryptionEnabled()) } - var nextMesosTaskId = 0 + private var nextMesosTaskId = 0 @volatile var appId: String = _ @@ -257,7 +260,7 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( } override def sufficientResourcesRegistered(): Boolean = { -tota
spark git commit: [SPARK-19775][SQL] Remove an obsolete `partitionBy().insertInto()` test case
Repository: spark Updated Branches: refs/heads/master 2ff1467d6 -> db0ddce52 [SPARK-19775][SQL] Remove an obsolete `partitionBy().insertInto()` test case ## What changes were proposed in this pull request? This issue removes [a test case](https://github.com/apache/spark/blame/master/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala#L287-L298) which was introduced by [SPARK-14459](https://github.com/apache/spark/commit/652bbb1bf62722b08a062c7a2bf72019f85e179e) and was superseded by [SPARK-16033](https://github.com/apache/spark/blame/master/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala#L365-L371). Basically, we cannot use `partitionBy` and `insertInto` together. ```scala test("Reject partitioning that does not match table") { withSQLConf(("hive.exec.dynamic.partition.mode", "nonstrict")) { sql("CREATE TABLE partitioned (id bigint, data string) PARTITIONED BY (part string)") val data = (1 to 10).map(i => (i, s"data-$i", if ((i % 2) == 0) "even" else "odd")) .toDF("id", "data", "part") intercept[AnalysisException] { // cannot partition by 2 fields when there is only one in the table definition data.write.partitionBy("part", "data").insertInto("partitioned") } } } ``` ## How was this patch tested? This only removes a test case. Pass the existing Jenkins test. Author: Dongjoon Hyun Closes #17106 from dongjoon-hyun/SPARK-19775. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/db0ddce5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/db0ddce5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/db0ddce5 Branch: refs/heads/master Commit: db0ddce523bb823cba996e92ef36ceca31492d2c Parents: 2ff1467 Author: Dongjoon Hyun Authored: Thu Mar 2 00:45:59 2017 +0100 Committer: Sean Owen Committed: Thu Mar 2 00:45:59 2017 +0100 -- .../spark/sql/hive/InsertIntoHiveTableSuite.scala | 13 - 1 file changed, 13 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/db0ddce5/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala index 71ce5a7..d6999af 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala @@ -284,19 +284,6 @@ class InsertIntoHiveTableSuite extends QueryTest with TestHiveSingleton with Bef sql("DROP TABLE hiveTableWithStructValue") } - test("Reject partitioning that does not match table") { -withSQLConf(("hive.exec.dynamic.partition.mode", "nonstrict")) { - sql("CREATE TABLE partitioned (id bigint, data string) PARTITIONED BY (part string)") - val data = (1 to 10).map(i => (i, s"data-$i", if ((i % 2) == 0) "even" else "odd")) - .toDF("id", "data", "part") - - intercept[AnalysisException] { -// cannot partition by 2 fields when there is only one in the table definition -data.write.partitionBy("part", "data").insertInto("partitioned") - } -} - } - test("Test partition mode = strict") { withSQLConf(("hive.exec.dynamic.partition.mode", "strict")) { sql("CREATE TABLE partitioned (id bigint, data string) PARTITIONED BY (part string)") - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
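For anyone reading the removed test, the supported pattern is sketched below: when inserting into an existing partitioned Hive table, let the table definition drive the partitioning and do not call `partitionBy()`. The table and column names follow the removed test; the rest is illustrative.

```scala
import org.apache.spark.sql.SparkSession

object InsertIntoPartitionedSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .appName("InsertIntoPartitionedSketch")
      .enableHiveSupport()
      .getOrCreate()
    import spark.implicits._

    spark.sql("SET hive.exec.dynamic.partition.mode=nonstrict")
    spark.sql(
      "CREATE TABLE IF NOT EXISTS partitioned (id BIGINT, data STRING) " +
      "PARTITIONED BY (part STRING)")

    val data = (1 to 10)
      .map(i => (i.toLong, s"data-$i", if (i % 2 == 0) "even" else "odd"))
      .toDF("id", "data", "part")

    // Supported: the table's own partition column ("part") is picked up automatically.
    data.write.insertInto("partitioned")

    // Not supported, and rejected with an AnalysisException:
    // data.write.partitionBy("part").insertInto("partitioned")
  }
}
```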
spark git commit: [SPARK-19739][CORE] propagate S3 session token to cluster
Repository: spark Updated Branches: refs/heads/master d556b3170 -> fa50143cd [SPARK-19739][CORE] propagate S3 session token to cluster ## What changes were proposed in this pull request? Propagate the S3 session token to the cluster. ## How was this patch tested? existing unit tests Author: uncleGen Closes #17080 from uncleGen/SPARK-19739. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fa50143c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fa50143c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fa50143c Branch: refs/heads/master Commit: fa50143cd33586f4658892f434c9f6c23346e1bf Parents: d556b31 Author: uncleGen Authored: Fri Mar 3 11:49:00 2017 +0100 Committer: Sean Owen Committed: Fri Mar 3 11:49:00 2017 +0100 -- .../org/apache/spark/deploy/SparkHadoopUtil.scala | 13 - 1 file changed, 8 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fa50143c/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala index 941e2d1..f475ce8 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala @@ -82,17 +82,20 @@ class SparkHadoopUtil extends Logging { // the behavior of the old implementation of this code, for backwards compatibility. if (conf != null) { // Explicitly check for S3 environment variables - if (System.getenv("AWS_ACCESS_KEY_ID") != null && - System.getenv("AWS_SECRET_ACCESS_KEY") != null) { -val keyId = System.getenv("AWS_ACCESS_KEY_ID") -val accessKey = System.getenv("AWS_SECRET_ACCESS_KEY") - + val keyId = System.getenv("AWS_ACCESS_KEY_ID") + val accessKey = System.getenv("AWS_SECRET_ACCESS_KEY") + if (keyId != null && accessKey != null) { hadoopConf.set("fs.s3.awsAccessKeyId", keyId) hadoopConf.set("fs.s3n.awsAccessKeyId", keyId) hadoopConf.set("fs.s3a.access.key", keyId) hadoopConf.set("fs.s3.awsSecretAccessKey", accessKey) hadoopConf.set("fs.s3n.awsSecretAccessKey", accessKey) hadoopConf.set("fs.s3a.secret.key", accessKey) + +val sessionToken = System.getenv("AWS_SESSION_TOKEN") +if (sessionToken != null) { + hadoopConf.set("fs.s3a.session.token", sessionToken) +} } // Copy any "spark.hadoop.foo=bar" system properties into conf as "foo=bar" conf.getAll.foreach { case (key, value) => - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
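To illustrate what the propagated variables end up controlling, here is a sketch of configuring S3A temporary credentials by hand on a SparkContext. The property keys come from the diff above; the application name and bucket path are placeholders.

```scala
import org.apache.spark.{SparkConf, SparkContext}

object S3SessionTokenSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("S3SessionTokenSketch"))
    val hadoopConf = sc.hadoopConfiguration

    // Equivalent to what SparkHadoopUtil now does when the AWS_* variables are exported.
    hadoopConf.set("fs.s3a.access.key", sys.env("AWS_ACCESS_KEY_ID"))
    hadoopConf.set("fs.s3a.secret.key", sys.env("AWS_SECRET_ACCESS_KEY"))
    sys.env.get("AWS_SESSION_TOKEN").foreach { token =>
      hadoopConf.set("fs.s3a.session.token", token)
    }

    // Placeholder bucket/path; any S3A read now uses the temporary credentials.
    val lines = sc.textFile("s3a://some-bucket/some/prefix/*.txt")
    println(lines.count())
    sc.stop()
  }
}
```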
spark git commit: [SPARK-19797][DOC] ML pipeline document correction
Repository: spark Updated Branches: refs/heads/branch-2.1 1237aaea2 -> accbed7c2 [SPARK-19797][DOC] ML pipeline document correction ## What changes were proposed in this pull request? Description about pipeline in this paragraph is incorrect https://spark.apache.org/docs/latest/ml-pipeline.html#how-it-works > If the Pipeline had more **stages**, it would call the > LogisticRegressionModelâs transform() method on the DataFrame before > passing the DataFrame to the next stage. Reason: Transformer could also be a stage. But only another Estimator will invoke an transform call and pass the data to next stage. The description in the document misleads ML pipeline users. ## How was this patch tested? This is a tiny modification of **docs/ml-pipelines.md**. I jekyll build the modification and check the compiled document. Author: Zhe Sun Closes #17137 from ymwdalex/SPARK-19797-ML-pipeline-document-correction. (cherry picked from commit 0bac3e4cde75678beac02e67b8873fe779e9ad34) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/accbed7c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/accbed7c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/accbed7c Branch: refs/heads/branch-2.1 Commit: accbed7c2cfbe46fa6f55e97241b617c6ad4431f Parents: 1237aae Author: Zhe Sun Authored: Fri Mar 3 11:55:57 2017 +0100 Committer: Sean Owen Committed: Fri Mar 3 11:56:07 2017 +0100 -- docs/ml-pipeline.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/accbed7c/docs/ml-pipeline.md -- diff --git a/docs/ml-pipeline.md b/docs/ml-pipeline.md index 7cbb146..aa92c0a 100644 --- a/docs/ml-pipeline.md +++ b/docs/ml-pipeline.md @@ -132,7 +132,7 @@ The `Pipeline.fit()` method is called on the original `DataFrame`, which has raw The `Tokenizer.transform()` method splits the raw text documents into words, adding a new column with words to the `DataFrame`. The `HashingTF.transform()` method converts the words column into feature vectors, adding a new column with those vectors to the `DataFrame`. Now, since `LogisticRegression` is an `Estimator`, the `Pipeline` first calls `LogisticRegression.fit()` to produce a `LogisticRegressionModel`. -If the `Pipeline` had more stages, it would call the `LogisticRegressionModel`'s `transform()` +If the `Pipeline` had more `Estimator`s, it would call the `LogisticRegressionModel`'s `transform()` method on the `DataFrame` before passing the `DataFrame` to the next stage. A `Pipeline` is an `Estimator`. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-19797][DOC] ML pipeline document correction
Repository: spark Updated Branches: refs/heads/master fa50143cd -> 0bac3e4cd [SPARK-19797][DOC] ML pipeline document correction ## What changes were proposed in this pull request? Description about pipeline in this paragraph is incorrect https://spark.apache.org/docs/latest/ml-pipeline.html#how-it-works > If the Pipeline had more **stages**, it would call the > LogisticRegressionModelâs transform() method on the DataFrame before > passing the DataFrame to the next stage. Reason: Transformer could also be a stage. But only another Estimator will invoke an transform call and pass the data to next stage. The description in the document misleads ML pipeline users. ## How was this patch tested? This is a tiny modification of **docs/ml-pipelines.md**. I jekyll build the modification and check the compiled document. Author: Zhe Sun Closes #17137 from ymwdalex/SPARK-19797-ML-pipeline-document-correction. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0bac3e4c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0bac3e4c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0bac3e4c Branch: refs/heads/master Commit: 0bac3e4cde75678beac02e67b8873fe779e9ad34 Parents: fa50143 Author: Zhe Sun Authored: Fri Mar 3 11:55:57 2017 +0100 Committer: Sean Owen Committed: Fri Mar 3 11:55:57 2017 +0100 -- docs/ml-pipeline.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0bac3e4c/docs/ml-pipeline.md -- diff --git a/docs/ml-pipeline.md b/docs/ml-pipeline.md index 7cbb146..aa92c0a 100644 --- a/docs/ml-pipeline.md +++ b/docs/ml-pipeline.md @@ -132,7 +132,7 @@ The `Pipeline.fit()` method is called on the original `DataFrame`, which has raw The `Tokenizer.transform()` method splits the raw text documents into words, adding a new column with words to the `DataFrame`. The `HashingTF.transform()` method converts the words column into feature vectors, adding a new column with those vectors to the `DataFrame`. Now, since `LogisticRegression` is an `Estimator`, the `Pipeline` first calls `LogisticRegression.fit()` to produce a `LogisticRegressionModel`. -If the `Pipeline` had more stages, it would call the `LogisticRegressionModel`'s `transform()` +If the `Pipeline` had more `Estimator`s, it would call the `LogisticRegressionModel`'s `transform()` method on the `DataFrame` before passing the `DataFrame` to the next stage. A `Pipeline` is an `Estimator`. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
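Since the corrected sentence is about which pipeline stages are Estimators versus Transformers, a compact Scala sketch of the pipeline in question may help. It follows the pattern of the ML Pipelines guide: Tokenizer and HashingTF are Transformers, LogisticRegression is an Estimator, and fit() produces a PipelineModel. The toy training set is made up for illustration.

```scala
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.sql.SparkSession

object PipelineStagesSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("PipelineStagesSketch").getOrCreate()

    // Tiny, made-up training data: (id, text, label).
    val training = spark.createDataFrame(Seq(
      (0L, "a b c d e spark", 1.0),
      (1L, "b d", 0.0),
      (2L, "spark f g h", 1.0),
      (3L, "hadoop mapreduce", 0.0)
    )).toDF("id", "text", "label")

    val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
    val hashingTF = new HashingTF()
      .setInputCol(tokenizer.getOutputCol)
      .setOutputCol("features")
    val lr = new LogisticRegression().setMaxIter(10)

    // Two Transformers followed by one Estimator; fit() runs the Estimator last.
    val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, lr))
    val model = pipeline.fit(training)

    // The fitted PipelineModel can then transform new data end to end.
    model.transform(training).select("id", "prediction").show()
  }
}
```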
spark git commit: [SPARK-19801][BUILD] Remove JDK7 from Travis CI
Repository: spark Updated Branches: refs/heads/master 0bac3e4cd -> 776fac398 [SPARK-19801][BUILD] Remove JDK7 from Travis CI ## What changes were proposed in this pull request? Since Spark 2.1.0, Travis CI was supported by SPARK-15207 for automated PR verification (JDK7/JDK8 maven compilation and Java Linter) and contributors can see the additional result via their Travis CI dashboard (or PC). This PR aims to make `.travis.yml` up-to-date by removing JDK7 which was removed via SPARK-19550. ## How was this patch tested? See the result via Travis CI. - https://travis-ci.org/dongjoon-hyun/spark/builds/207111713 Author: Dongjoon Hyun Closes #17143 from dongjoon-hyun/SPARK-19801. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/776fac39 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/776fac39 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/776fac39 Branch: refs/heads/master Commit: 776fac3988271a1e4128cb31f21e5f7f3b7bcf0e Parents: 0bac3e4 Author: Dongjoon Hyun Authored: Fri Mar 3 12:00:54 2017 +0100 Committer: Sean Owen Committed: Fri Mar 3 12:00:54 2017 +0100 -- .travis.yml | 1 - 1 file changed, 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/776fac39/.travis.yml -- diff --git a/.travis.yml b/.travis.yml index d94872d..d7e9f8c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -28,7 +28,6 @@ dist: trusty # 2. Choose language and target JDKs for parallel builds. language: java jdk: - - oraclejdk7 - oraclejdk8 # 3. Setup cache directory for SBT and Maven. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark-website git commit: Update committer list
Repository: spark-website Updated Branches: refs/heads/asf-site 470b7ed51 -> c1b9ad3cb Update committer list Project: http://git-wip-us.apache.org/repos/asf/spark-website/repo Commit: http://git-wip-us.apache.org/repos/asf/spark-website/commit/c1b9ad3c Tree: http://git-wip-us.apache.org/repos/asf/spark-website/tree/c1b9ad3c Diff: http://git-wip-us.apache.org/repos/asf/spark-website/diff/c1b9ad3c Branch: refs/heads/asf-site Commit: c1b9ad3cbe413b10f872c6a3363f1028c31b1a16 Parents: 470b7ed Author: Holden Karau Authored: Wed Mar 1 22:15:10 2017 -0800 Committer: Sean Owen Committed: Fri Mar 3 12:31:03 2017 +0100 -- committers.md| 4 site/committers.html | 15 ++- 2 files changed, 18 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark-website/blob/c1b9ad3c/committers.md -- diff --git a/committers.md b/committers.md index 03defa6..a97bb72 100644 --- a/committers.md +++ b/committers.md @@ -28,6 +28,7 @@ navigation: |Herman van Hovell|QuestTec B.V.| |Yin Huai|Databricks| |Shane Huang|Intel| +|Holden Karau|IBM| |Andy Konwinski|Databricks| |Ryan LeCompte|Quantifind| |Haoyuan Li|Alluxio, UC Berkeley| @@ -50,11 +51,13 @@ navigation: |Prashant Sharma|IBM| |Ram Sriharsha|Databricks| |DB Tsai|Netflix| +|Takuya Ueshin|| |Marcelo Vanzin|Cloudera| |Shivaram Venkataraman|UC Berkeley| |Patrick Wendell|Databricks| |Andrew Xia|Alibaba| |Reynold Xin|Databricks| +|Burak Yavuz|Databricks| |Matei Zaharia|Databricks, Stanford| |Shixiong Zhu|Databricks| @@ -117,6 +120,7 @@ You can verify the result is one change with `git log`. Then resume the script i Also, please remember to set Assignee on JIRAs where applicable when they are resolved. The script can't do this automatically. +Once a PR is merged please leave a comment on the PR stating which branch(es) it has been merged with.
spark git commit: [SPARK-19550][SPARKR][DOCS] Update R document to use JDK8
Repository: spark Updated Branches: refs/heads/master fbc405803 -> 6b0cfd9fa [SPARK-19550][SPARKR][DOCS] Update R document to use JDK8 ## What changes were proposed in this pull request? Update R document to use JDK8. ## How was this patch tested? manual tests Author: Yuming Wang Closes #17162 from wangyum/SPARK-19550. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6b0cfd9f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6b0cfd9f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6b0cfd9f Branch: refs/heads/master Commit: 6b0cfd9fa51aca4536d7c3f2a4bbceae11a50339 Parents: fbc4058 Author: Yuming Wang Authored: Sat Mar 4 16:43:31 2017 + Committer: Sean Owen Committed: Sat Mar 4 16:43:31 2017 + -- R/WINDOWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6b0cfd9f/R/WINDOWS.md -- diff --git a/R/WINDOWS.md b/R/WINDOWS.md index cb2eebb..9ca7e58 100644 --- a/R/WINDOWS.md +++ b/R/WINDOWS.md @@ -6,7 +6,7 @@ To build SparkR on Windows, the following steps are required include Rtools and R in `PATH`. 2. Install -[JDK7](http://www.oracle.com/technetwork/java/javase/downloads/jdk7-downloads-1880260.html) and set +[JDK8](http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html) and set `JAVA_HOME` in the system environment variables. 3. Download and install [Maven](http://maven.apache.org/download.html). Also include the `bin` - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-19792][WEBUI] In the Master Page, the column named “Memory per Node”, I think it is not all right
Repository: spark Updated Branches: refs/heads/master 6b0cfd9fa -> 42c4cd9e2 [SPARK-19792][WEBUI] In the Master Page, the column named “Memory per Node”, I think it is not all right Signed-off-by: liuxian ## What changes were proposed in this pull request? On the Master Page of the Spark web UI there are two tables, Running Applications and Completed Applications. Their column named “Memory per Node” is misleading, because a node may run more than one executor, so it should be renamed “Memory per Executor” to avoid user misunderstanding. ## How was this patch tested? N/A Author: liuxian Closes #17132 from 10110346/wid-lx-0302. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/42c4cd9e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/42c4cd9e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/42c4cd9e Branch: refs/heads/master Commit: 42c4cd9e2a44eaa6a16e3b490eb82b6292d9b2ea Parents: 6b0cfd9 Author: liuxian Authored: Sun Mar 5 10:23:50 2017 + Committer: Sean Owen Committed: Sun Mar 5 10:23:50 2017 + -- .../main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/42c4cd9e/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala index 7dbe329..e722a24 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala @@ -76,7 +76,7 @@ private[ui] class MasterPage(parent: MasterWebUI) extends WebUIPage("") { val aliveWorkers = state.workers.filter(_.state == WorkerState.ALIVE) val workerTable = UIUtils.listingTable(workerHeaders, workerRow, workers) -val appHeaders = Seq("Application ID", "Name", "Cores", "Memory per Node", "Submitted Time", +val appHeaders = Seq("Application ID", "Name", "Cores", "Memory per Executor", "Submitted Time", "User", "State", "Duration") val activeApps = state.activeApps.sortBy(_.startTime).reverse val activeAppsTable = UIUtils.listingTable(appHeaders, appRow, activeApps) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-19702][MESOS] Increase default refuse_seconds timeout in the Mesos Spark Dispatcher
Repository: spark Updated Branches: refs/heads/master 6f4684622 -> 2e30c0b9b [SPARK-19702][MESOS] Increase default refuse_seconds timeout in the Mesos Spark Dispatcher ## What changes were proposed in this pull request? Increase default refuse_seconds timeout, and make it configurable. See JIRA for details on how this reduces the risk of starvation. ## How was this patch tested? Unit tests, Manual testing, and Mesos/Spark integration test suite cc susanxhuynh skonto jmlvanre Author: Michael Gummelt Closes #17031 from mgummelt/SPARK-19702-suppress-revive. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2e30c0b9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2e30c0b9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2e30c0b9 Branch: refs/heads/master Commit: 2e30c0b9bcaa6f7757bd85d1f1ec392d5f916f83 Parents: 6f46846 Author: Michael Gummelt Authored: Tue Mar 7 21:29:08 2017 + Committer: Sean Owen Committed: Tue Mar 7 21:29:08 2017 + -- .../cluster/mesos/MesosClusterScheduler.scala | 75 ++-- .../MesosCoarseGrainedSchedulerBackend.scala| 69 -- .../MesosFineGrainedSchedulerBackend.scala | 19 +++-- .../cluster/mesos/MesosSchedulerUtils.scala | 60 .../mesos/MesosClusterSchedulerSuite.scala | 51 - ...esosCoarseGrainedSchedulerBackendSuite.scala | 7 +- .../spark/scheduler/cluster/mesos/Utils.scala | 11 +++ 7 files changed, 187 insertions(+), 105 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2e30c0b9/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala -- diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala index 2760f31..1bc6f71 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala @@ -152,6 +152,7 @@ private[spark] class MesosClusterScheduler( // is registered with Mesos master. @volatile protected var ready = false private var masterInfo: Option[MasterInfo] = None + private var schedulerDriver: SchedulerDriver = _ def submitDriver(desc: MesosDriverDescription): CreateSubmissionResponse = { val c = new CreateSubmissionResponse @@ -168,9 +169,8 @@ private[spark] class MesosClusterScheduler( return c } c.submissionId = desc.submissionId - queuedDriversState.persist(desc.submissionId, desc) - queuedDrivers += desc c.success = true + addDriverToQueue(desc) } c } @@ -191,7 +191,7 @@ private[spark] class MesosClusterScheduler( // 4. Check if it has already completed. if (launchedDrivers.contains(submissionId)) { val task = launchedDrivers(submissionId) -mesosDriver.killTask(task.taskId) +schedulerDriver.killTask(task.taskId) k.success = true k.message = "Killing running driver" } else if (removeFromQueuedDrivers(submissionId)) { @@ -324,7 +324,7 @@ private[spark] class MesosClusterScheduler( ready = false metricsSystem.report() metricsSystem.stop() -mesosDriver.stop(true) +schedulerDriver.stop(true) } override def registered( @@ -340,6 +340,8 @@ private[spark] class MesosClusterScheduler( stateLock.synchronized { this.masterInfo = Some(masterInfo) + this.schedulerDriver = driver + if (!pendingRecover.isEmpty) { // Start task reconciliation if we need to recover. 
val statuses = pendingRecover.collect { @@ -506,11 +508,10 @@ private[spark] class MesosClusterScheduler( } private class ResourceOffer( - val offerId: OfferID, - val slaveId: SlaveID, - var resources: JList[Resource]) { + val offer: Offer, + var remainingResources: JList[Resource]) { override def toString(): String = { - s"Offer id: ${offerId}, resources: ${resources}" + s"Offer id: ${offer.getId}, resources: ${remainingResources}" } } @@ -518,16 +519,16 @@ private[spark] class MesosClusterScheduler( val taskId = TaskID.newBuilder().setValue(desc.submissionId).build() val (remainingResources, cpuResourcesToUse) = - partitionResources(offer.resources, "cpus", desc.cores) + partitionResources(offer.remainingResources, "cpus", desc.cores) val (finalResources, memResourcesToUse) = partitionResources(remainingResources.asJava, "m
spark git commit: [SPARK-19693][SQL] Make the SET mapreduce.job.reduces automatically converted to spark.sql.shuffle.partitions
Repository: spark Updated Branches: refs/heads/master 81303f7ca -> 3f9f9180c [SPARK-19693][SQL] Make the SET mapreduce.job.reduces automatically converted to spark.sql.shuffle.partitions ## What changes were proposed in this pull request? Make the `SET mapreduce.job.reduces` automatically converted to `spark.sql.shuffle.partitions`, it's similar to `SET mapred.reduce.tasks`. ## How was this patch tested? unit tests Author: Yuming Wang Closes #17020 from wangyum/SPARK-19693. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3f9f9180 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3f9f9180 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3f9f9180 Branch: refs/heads/master Commit: 3f9f9180c2e695ad468eb813df5feec41e169531 Parents: 81303f7 Author: Yuming Wang Authored: Wed Mar 8 11:31:01 2017 + Committer: Sean Owen Committed: Wed Mar 8 11:31:01 2017 + -- .../spark/sql/execution/command/SetCommand.scala | 17 + .../org/apache/spark/sql/internal/SQLConf.scala| 4 .../scala/org/apache/spark/sql/SQLQuerySuite.scala | 12 3 files changed, 33 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3f9f9180/sql/core/src/main/scala/org/apache/spark/sql/execution/command/SetCommand.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/SetCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/SetCommand.scala index 7afa4e7..5f12830 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/SetCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/SetCommand.scala @@ -60,6 +60,23 @@ case class SetCommand(kv: Option[(String, Option[String])]) extends RunnableComm } (keyValueOutput, runFunc) +case Some((SQLConf.Replaced.MAPREDUCE_JOB_REDUCES, Some(value))) => + val runFunc = (sparkSession: SparkSession) => { +logWarning( + s"Property ${SQLConf.Replaced.MAPREDUCE_JOB_REDUCES} is Hadoop's property, " + +s"automatically converted to ${SQLConf.SHUFFLE_PARTITIONS.key} instead.") +if (value.toInt < 1) { + val msg = +s"Setting negative ${SQLConf.Replaced.MAPREDUCE_JOB_REDUCES} for automatically " + + "determining the number of reducers is not supported." 
+ throw new IllegalArgumentException(msg) +} else { + sparkSession.conf.set(SQLConf.SHUFFLE_PARTITIONS.key, value) + Seq(Row(SQLConf.SHUFFLE_PARTITIONS.key, value)) +} + } + (keyValueOutput, runFunc) + case Some((key @ SetCommand.VariableName(name), Some(value))) => val runFunc = (sparkSession: SparkSession) => { sparkSession.conf.set(name, value) http://git-wip-us.apache.org/repos/asf/spark/blob/3f9f9180/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 461dfe3..fd3acd4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -677,6 +677,10 @@ object SQLConf { object Deprecated { val MAPRED_REDUCE_TASKS = "mapred.reduce.tasks" } + + object Replaced { +val MAPREDUCE_JOB_REDUCES = "mapreduce.job.reduces" + } } /** http://git-wip-us.apache.org/repos/asf/spark/blob/3f9f9180/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 468ea05..d9e0196 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -1019,6 +1019,18 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { spark.sessionState.conf.clear() } + test("SET mapreduce.job.reduces automatically converted to spark.sql.shuffle.partitions") { +spark.sessionState.conf.clear() +val before = spark.conf.get(SQLConf.SHUFFLE_PARTITIONS.key).toInt +val newConf = before + 1 +sql(s"SET mapreduce.job.reduces=${newConf.toString}") +val after = spark.conf.get(SQLConf.SHUFFLE_PARTITIONS.key).toInt +assert(before != after) +assert(newConf === after) +intercept[IllegalArgumentException](sql(s"SET mapreduce.job.reduces=-1")) +spark.sessionState.conf.clear() + } +
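For context on SPARK-19693, here is a minimal usage sketch of the behavior the patch adds, mirroring the new test above. It assumes an already running `SparkSession` named `spark`; it is an illustration, not code from the commit.

```scala
// Hedged sketch (assumes a live SparkSession `spark` with this patch applied).
val before = spark.conf.get("spark.sql.shuffle.partitions")

// The Hadoop-style property is automatically converted to spark.sql.shuffle.partitions
// (a warning is logged, as in the SetCommand change above).
spark.sql("SET mapreduce.job.reduces=5")
assert(spark.conf.get("spark.sql.shuffle.partitions") == "5")

// Negative values are rejected, matching the new test:
// spark.sql("SET mapreduce.job.reduces=-1")   // throws IllegalArgumentException

println(s"shuffle partitions changed from $before to 5")
```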
spark git commit: [SPARK-16440][MLLIB] Ensure broadcasted variables are destroyed even in case of exception
Repository: spark Updated Branches: refs/heads/master 3f9f9180c -> 9ea201cf6 [SPARK-16440][MLLIB] Ensure broadcasted variables are destroyed even in case of exception ## What changes were proposed in this pull request? Ensure broadcasted variable are destroyed even in case of exception ## How was this patch tested? Word2VecSuite was run locally Author: Anthony Truchet Closes #14299 from AnthonyTruchet/SPARK-16440. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9ea201cf Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9ea201cf Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9ea201cf Branch: refs/heads/master Commit: 9ea201cf6482c9c62c9428759d238063db62d66e Parents: 3f9f918 Author: Anthony Truchet Authored: Wed Mar 8 11:44:25 2017 + Committer: Sean Owen Committed: Wed Mar 8 11:44:25 2017 + -- .../org/apache/spark/mllib/feature/Word2Vec.scala | 18 +++--- 1 file changed, 15 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9ea201cf/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index 2364d43..531c8b0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -30,6 +30,7 @@ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext import org.apache.spark.annotation.Since import org.apache.spark.api.java.JavaRDD +import org.apache.spark.broadcast.Broadcast import org.apache.spark.internal.Logging import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.{Loader, Saveable} @@ -314,6 +315,20 @@ class Word2Vec extends Serializable with Logging { val expTable = sc.broadcast(createExpTable()) val bcVocab = sc.broadcast(vocab) val bcVocabHash = sc.broadcast(vocabHash) +try { + doFit(dataset, sc, expTable, bcVocab, bcVocabHash) +} finally { + expTable.destroy(blocking = false) + bcVocab.destroy(blocking = false) + bcVocabHash.destroy(blocking = false) +} + } + + private def doFit[S <: Iterable[String]]( +dataset: RDD[S], sc: SparkContext, +expTable: Broadcast[Array[Float]], +bcVocab: Broadcast[Array[VocabWord]], +bcVocabHash: Broadcast[mutable.HashMap[String, Int]]) = { // each partition is a collection of sentences, // will be translated into arrays of Index integer val sentences: RDD[Array[Int]] = dataset.mapPartitions { sentenceIter => @@ -435,9 +450,6 @@ class Word2Vec extends Serializable with Logging { bcSyn1Global.destroy(false) } newSentences.unpersist() -expTable.destroy(false) -bcVocab.destroy(false) -bcVocabHash.destroy(false) val wordArray = vocab.map(_.word) new Word2VecModel(wordArray.zipWithIndex.toMap, syn0Global) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
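The try/finally pattern adopted in SPARK-16440 generalizes beyond Word2Vec. Below is a minimal sketch of the same idea, assuming an existing `SparkContext` named `sc`; the helper name and signature are illustrative, only `broadcast` and `destroy` are Spark APIs.

```scala
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast

// Hedged sketch: guarantee broadcast cleanup with try/finally, mirroring the doFit refactor above.
def withBroadcastTable[T](sc: SparkContext, data: Array[Float])(body: Broadcast[Array[Float]] => T): T = {
  val bc = sc.broadcast(data)
  try {
    body(bc)        // may throw; the broadcast is still destroyed below
  } finally {
    bc.destroy()    // released even when `body` fails
  }
}
```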
spark git commit: [DOCS][SS] fix structured streaming python example
Repository: spark Updated Branches: refs/heads/branch-2.1 e481a7381 -> f9833c66a [DOCS][SS] fix structured streaming python example ## What changes were proposed in this pull request? - SS python example: `TypeError: 'xxx' object is not callable` - some other doc issue. ## How was this patch tested? Jenkins. Author: uncleGen Closes #17257 from uncleGen/docs-ss-python. (cherry picked from commit e29a74d5b1fa3f9356b7af5dd7e3fce49bc8eb7d) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f9833c66 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f9833c66 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f9833c66 Branch: refs/heads/branch-2.1 Commit: f9833c66a2f11414357854dae00e9e2448869254 Parents: e481a73 Author: uncleGen Authored: Sun Mar 12 08:29:37 2017 + Committer: Sean Owen Committed: Sun Mar 12 08:29:46 2017 + -- docs/structured-streaming-programming-guide.md| 18 +- .../execution/streaming/FileStreamSource.scala| 2 +- .../streaming/dstream/FileInputDStream.scala | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f9833c66/docs/structured-streaming-programming-guide.md -- diff --git a/docs/structured-streaming-programming-guide.md b/docs/structured-streaming-programming-guide.md index 45ee551..d316e04 100644 --- a/docs/structured-streaming-programming-guide.md +++ b/docs/structured-streaming-programming-guide.md @@ -545,7 +545,7 @@ spark = SparkSession. ... # Read text from socket socketDF = spark \ -.readStream() \ +.readStream \ .format("socket") \ .option("host", "localhost") \ .option("port", ) \ @@ -558,7 +558,7 @@ socketDF.printSchema() # Read all the csv files written atomically in a directory userSchema = StructType().add("name", "string").add("age", "integer") csvDF = spark \ -.readStream() \ +.readStream \ .option("sep", ";") \ .schema(userSchema) \ .csv("/path/to/directory") # Equivalent to format("csv").load("/path/to/directory") @@ -995,7 +995,7 @@ Here is the compatibility matrix. Update mode uses watermark to drop old aggregation state. -Complete mode does drop not old aggregation state since by definition this mode +Complete mode does not drop old aggregation state since by definition this mode preserves all data in the Result Table. @@ -1217,13 +1217,13 @@ noAggDF = deviceDataDf.select("device").where("signal > 10") # Print new data to console noAggDF \ -.writeStream() \ +.writeStream \ .format("console") \ .start() # Write new data to Parquet files noAggDF \ -.writeStream() \ +.writeStream \ .format("parquet") \ .option("checkpointLocation", "path/to/checkpoint/dir") \ .option("path", "path/to/destination/dir") \ @@ -1234,14 +1234,14 @@ aggDF = df.groupBy("device").count() # Print updated aggregations to console aggDF \ -.writeStream() \ +.writeStream \ .outputMode("complete") \ .format("console") \ .start() # Have all the aggregates in an in memory table. 
The query name will be the table name aggDF \ -.writeStream() \ +.writeStream \ .queryName("aggregates") \ .outputMode("complete") \ .format("memory") \ @@ -1329,7 +1329,7 @@ query.lastProgress();// the most recent progress update of this streaming qu {% highlight python %} -query = df.writeStream().format("console").start() # get the query object +query = df.writeStream.format("console").start() # get the query object query.id() # get the unique identifier of the running query that persists across restarts from checkpoint data @@ -1674,7 +1674,7 @@ aggDF {% highlight python %} aggDF \ -.writeStream() \ +.writeStream \ .outputMode("complete") \ .option("checkpointLocation", "path/to/HDFS/dir") \ .format("memory") \ http://git-wip-us.apache.org/repos/asf/spark/blob/f9833c66/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala index 0f0b6f1..fd94bb6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala @@ -86,7 +86,7 @@ class FileStreamSource( } seenFiles.purge() - logInfo(s"maxFi
spark git commit: [DOCS][SS] fix structured streaming python example
Repository: spark Updated Branches: refs/heads/master f6fdf92d0 -> e29a74d5b [DOCS][SS] fix structured streaming python example ## What changes were proposed in this pull request? - SS python example: `TypeError: 'xxx' object is not callable` - some other doc issue. ## How was this patch tested? Jenkins. Author: uncleGen Closes #17257 from uncleGen/docs-ss-python. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e29a74d5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e29a74d5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e29a74d5 Branch: refs/heads/master Commit: e29a74d5b1fa3f9356b7af5dd7e3fce49bc8eb7d Parents: f6fdf92 Author: uncleGen Authored: Sun Mar 12 08:29:37 2017 + Committer: Sean Owen Committed: Sun Mar 12 08:29:37 2017 + -- docs/structured-streaming-programming-guide.md| 18 +- .../execution/streaming/FileStreamSource.scala| 2 +- .../streaming/dstream/FileInputDStream.scala | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e29a74d5/docs/structured-streaming-programming-guide.md -- diff --git a/docs/structured-streaming-programming-guide.md b/docs/structured-streaming-programming-guide.md index 995ac77..7988472 100644 --- a/docs/structured-streaming-programming-guide.md +++ b/docs/structured-streaming-programming-guide.md @@ -539,7 +539,7 @@ spark = SparkSession. ... # Read text from socket socketDF = spark \ -.readStream() \ +.readStream \ .format("socket") \ .option("host", "localhost") \ .option("port", ) \ @@ -552,7 +552,7 @@ socketDF.printSchema() # Read all the csv files written atomically in a directory userSchema = StructType().add("name", "string").add("age", "integer") csvDF = spark \ -.readStream() \ +.readStream \ .option("sep", ";") \ .schema(userSchema) \ .csv("/path/to/directory") # Equivalent to format("csv").load("/path/to/directory") @@ -971,7 +971,7 @@ Here is the compatibility matrix. Update mode uses watermark to drop old aggregation state. -Complete mode does drop not old aggregation state since by definition this mode +Complete mode does not drop old aggregation state since by definition this mode preserves all data in the Result Table. @@ -1201,13 +1201,13 @@ noAggDF = deviceDataDf.select("device").where("signal > 10") # Print new data to console noAggDF \ -.writeStream() \ +.writeStream \ .format("console") \ .start() # Write new data to Parquet files noAggDF \ -.writeStream() \ +.writeStream \ .format("parquet") \ .option("checkpointLocation", "path/to/checkpoint/dir") \ .option("path", "path/to/destination/dir") \ @@ -1218,14 +1218,14 @@ aggDF = df.groupBy("device").count() # Print updated aggregations to console aggDF \ -.writeStream() \ +.writeStream \ .outputMode("complete") \ .format("console") \ .start() # Have all the aggregates in an in memory table. 
The query name will be the table name aggDF \ -.writeStream() \ +.writeStream \ .queryName("aggregates") \ .outputMode("complete") \ .format("memory") \ @@ -1313,7 +1313,7 @@ query.lastProgress();// the most recent progress update of this streaming qu {% highlight python %} -query = df.writeStream().format("console").start() # get the query object +query = df.writeStream.format("console").start() # get the query object query.id() # get the unique identifier of the running query that persists across restarts from checkpoint data @@ -1658,7 +1658,7 @@ aggDF {% highlight python %} aggDF \ -.writeStream() \ +.writeStream \ .outputMode("complete") \ .option("checkpointLocation", "path/to/HDFS/dir") \ .format("memory") \ http://git-wip-us.apache.org/repos/asf/spark/blob/e29a74d5/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala index 411a15f..a9e64c6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala @@ -97,7 +97,7 @@ class FileStreamSource( } seenFiles.purge() - logInfo(s"maxFilesPerBatch = $maxFilesPerBatch, maxFileAge = $maxFileAgeMs") + logInfo(s"maxFilesPerBatch = $maxFilesP
[2/2] spark-website git commit: add Spark Project Improvement Proposals doc
add Spark Project Improvement Proposals doc Project: http://git-wip-us.apache.org/repos/asf/spark-website/repo Commit: http://git-wip-us.apache.org/repos/asf/spark-website/commit/39838046 Tree: http://git-wip-us.apache.org/repos/asf/spark-website/tree/39838046 Diff: http://git-wip-us.apache.org/repos/asf/spark-website/diff/39838046 Branch: refs/heads/asf-site Commit: 39838046c1f2cc0c3e00bd08f5130af6f9798aed Parents: c1b9ad3 Author: cody koeninger Authored: Fri Mar 10 13:31:58 2017 -0600 Committer: cody koeninger Committed: Fri Mar 10 13:31:58 2017 -0600 -- _layouts/global.html| 1 + improvement-proposals.md| 91 ++ site/committers.html| 1 + site/community.html | 1 + site/contributing.html | 1 + site/developer-tools.html | 1 + site/documentation.html | 1 + site/downloads.html | 1 + site/examples.html | 1 + site/faq.html | 1 + site/graphx/index.html | 1 + site/improvement-proposals.html | 295 +++ site/index.html | 1 + site/mailing-lists.html | 1 + site/mllib/index.html | 1 + site/news/amp-camp-2013-registration-ope.html | 1 + .../news/announcing-the-first-spark-summit.html | 1 + .../news/fourth-spark-screencast-published.html | 1 + site/news/index.html| 1 + site/news/nsdi-paper.html | 1 + site/news/one-month-to-spark-summit-2015.html | 1 + .../proposals-open-for-spark-summit-east.html | 1 + ...registration-open-for-spark-summit-east.html | 1 + .../news/run-spark-and-shark-on-amazon-emr.html | 1 + site/news/spark-0-6-1-and-0-5-2-released.html | 1 + site/news/spark-0-6-2-released.html | 1 + site/news/spark-0-7-0-released.html | 1 + site/news/spark-0-7-2-released.html | 1 + site/news/spark-0-7-3-released.html | 1 + site/news/spark-0-8-0-released.html | 1 + site/news/spark-0-8-1-released.html | 1 + site/news/spark-0-9-0-released.html | 1 + site/news/spark-0-9-1-released.html | 1 + site/news/spark-0-9-2-released.html | 1 + site/news/spark-1-0-0-released.html | 1 + site/news/spark-1-0-1-released.html | 1 + site/news/spark-1-0-2-released.html | 1 + site/news/spark-1-1-0-released.html | 1 + site/news/spark-1-1-1-released.html | 1 + site/news/spark-1-2-0-released.html | 1 + site/news/spark-1-2-1-released.html | 1 + site/news/spark-1-2-2-released.html | 1 + site/news/spark-1-3-0-released.html | 1 + site/news/spark-1-4-0-released.html | 1 + site/news/spark-1-4-1-released.html | 1 + site/news/spark-1-5-0-released.html | 1 + site/news/spark-1-5-1-released.html | 1 + site/news/spark-1-5-2-released.html | 1 + site/news/spark-1-6-0-released.html | 1 + site/news/spark-1-6-1-released.html | 1 + site/news/spark-1-6-2-released.html | 1 + site/news/spark-1-6-3-released.html | 1 + site/news/spark-2-0-0-released.html | 1 + site/news/spark-2-0-1-released.html | 1 + site/news/spark-2-0-2-released.html | 1 + site/news/spark-2-1-0-released.html | 1 + site/news/spark-2.0.0-preview.html | 1 + .../spark-accepted-into-apache-incubator.html | 1 + site/news/spark-and-shark-in-the-news.html | 1 + site/news/spark-becomes-tlp.html| 1 + site/news/spark-featured-in-wired.html | 1 + .../spark-mailing-lists-moving-to-apache.html | 1 + site/news/spark-meetups.html| 1 + site/news/spark-screencasts-published.html | 1 + site/news/spark-summit-2013-is-a-wrap.html | 1 + site/news/spark-summit-2014-videos-posted.html | 1 + site/news/spark-summit-2015-videos-posted.html | 1 + site/news/spark-summit-agenda-posted.html | 1 + .../spark-summit-east-2015-videos-posted.html | 1 + .../spark-summit-east-2016-cfp-closing.html | 1 + .../spark-summit-east-2017-agenda-posted.html | 1 + site/news/spark-summit-east-agenda-posted.html | 1 + 
.../news/spark-summit-europe-agenda-posted.html | 1 + site/news/spark-summit-europe.html | 1 + .../spark-summit-june-2016-agenda-posted.html | 1 + site/news/spark-tips-from-quantifind.html | 1 + .../sp
[1/2] spark-website git commit: add Spark Project Improvement Proposals doc
Repository: spark-website Updated Branches: refs/heads/asf-site c1b9ad3cb -> 39838046c http://git-wip-us.apache.org/repos/asf/spark-website/blob/39838046/site/news/spark-summit-east-2016-cfp-closing.html -- diff --git a/site/news/spark-summit-east-2016-cfp-closing.html b/site/news/spark-summit-east-2016-cfp-closing.html index cc43c32..74fde88 100644 --- a/site/news/spark-summit-east-2016-cfp-closing.html +++ b/site/news/spark-summit-east-2016-cfp-closing.html @@ -119,6 +119,7 @@ Mailing Lists & Resources Contributing to Spark + Improvement Proposals (SPIP) https://issues.apache.org/jira/browse/SPARK";>Issue Tracker Powered By Project Committers http://git-wip-us.apache.org/repos/asf/spark-website/blob/39838046/site/news/spark-summit-east-2017-agenda-posted.html -- diff --git a/site/news/spark-summit-east-2017-agenda-posted.html b/site/news/spark-summit-east-2017-agenda-posted.html index 58af016..65d3636 100644 --- a/site/news/spark-summit-east-2017-agenda-posted.html +++ b/site/news/spark-summit-east-2017-agenda-posted.html @@ -119,6 +119,7 @@ Mailing Lists & Resources Contributing to Spark + Improvement Proposals (SPIP) https://issues.apache.org/jira/browse/SPARK";>Issue Tracker Powered By Project Committers http://git-wip-us.apache.org/repos/asf/spark-website/blob/39838046/site/news/spark-summit-east-agenda-posted.html -- diff --git a/site/news/spark-summit-east-agenda-posted.html b/site/news/spark-summit-east-agenda-posted.html index 0bc68c7..9b3f5ba 100644 --- a/site/news/spark-summit-east-agenda-posted.html +++ b/site/news/spark-summit-east-agenda-posted.html @@ -119,6 +119,7 @@ Mailing Lists & Resources Contributing to Spark + Improvement Proposals (SPIP) https://issues.apache.org/jira/browse/SPARK";>Issue Tracker Powered By Project Committers http://git-wip-us.apache.org/repos/asf/spark-website/blob/39838046/site/news/spark-summit-europe-agenda-posted.html -- diff --git a/site/news/spark-summit-europe-agenda-posted.html b/site/news/spark-summit-europe-agenda-posted.html index dfbb20d..176f8f2 100644 --- a/site/news/spark-summit-europe-agenda-posted.html +++ b/site/news/spark-summit-europe-agenda-posted.html @@ -119,6 +119,7 @@ Mailing Lists & Resources Contributing to Spark + Improvement Proposals (SPIP) https://issues.apache.org/jira/browse/SPARK";>Issue Tracker Powered By Project Committers http://git-wip-us.apache.org/repos/asf/spark-website/blob/39838046/site/news/spark-summit-europe.html -- diff --git a/site/news/spark-summit-europe.html b/site/news/spark-summit-europe.html index 9728b5f..a122ab0 100644 --- a/site/news/spark-summit-europe.html +++ b/site/news/spark-summit-europe.html @@ -119,6 +119,7 @@ Mailing Lists & Resources Contributing to Spark + Improvement Proposals (SPIP) https://issues.apache.org/jira/browse/SPARK";>Issue Tracker Powered By Project Committers http://git-wip-us.apache.org/repos/asf/spark-website/blob/39838046/site/news/spark-summit-june-2016-agenda-posted.html -- diff --git a/site/news/spark-summit-june-2016-agenda-posted.html b/site/news/spark-summit-june-2016-agenda-posted.html index 969edb4..6b5e8c5 100644 --- a/site/news/spark-summit-june-2016-agenda-posted.html +++ b/site/news/spark-summit-june-2016-agenda-posted.html @@ -119,6 +119,7 @@ Mailing Lists & Resources Contributing to Spark + Improvement Proposals (SPIP) https://issues.apache.org/jira/browse/SPARK";>Issue Tracker Powered By Project Committers http://git-wip-us.apache.org/repos/asf/spark-website/blob/39838046/site/news/spark-tips-from-quantifind.html -- diff --git 
a/site/news/spark-tips-from-quantifind.html b/site/news/spark-tips-from-quantifind.html index 546feda..246eb4e 100644 --- a/site/news/spark-tips-from-quantifind.html +++ b/site/news/spark-tips-from-quantifind.html @@ -119,6 +119,7 @@ Mailing Lists & Resources Contributing to Spark + Improvement Proposals (SPIP) https://issues.apache.org/jira/browse/SPARK";>Issue Tracker Powered By Project Committers http://git-wip-us.apache.org/repos/asf/
spark git commit: [SPARK-19922][ML] small speedups to findSynonyms
Repository: spark Updated Branches: refs/heads/master 1c7275efa -> 5e96a57b2 [SPARK-19922][ML] small speedups to findSynonyms Currently generating synonyms using a large model (I've tested with 3m words) is very slow. These efficiencies have sped things up for us by ~17% I wasn't sure if such small changes were worthy of a jira, but the guidelines seemed to suggest that that is the preferred approach ## What changes were proposed in this pull request? Address a few small issues in the findSynonyms logic: 1) remove usage of ``Array.fill`` to zero out the ``cosineVec`` array. The default float value in Scala and Java is 0.0f, so explicitly setting the values to zero is not needed 2) use Floats throughout. The conversion to Doubles before doing the ``priorityQueue`` is totally superfluous, since all the similarity computations are done using Floats anyway. Creating a second large array just serves to put extra strain on the GC 3) convert the slow ``for(i <- cosVec.indices)`` to an ugly, but faster, ``while`` loop These efficiencies are really only apparent when working with a large model ## How was this patch tested? Existing unit tests + some in-house tests to time the difference cc jkbradley MLNick srowen Author: Asher Krim Author: Asher Krim Closes #17263 from Krimit/fasterFindSynonyms. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5e96a57b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5e96a57b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5e96a57b Branch: refs/heads/master Commit: 5e96a57b2f383d4b33735681b41cd3ec06570671 Parents: 1c7275e Author: Asher Krim Authored: Tue Mar 14 13:08:11 2017 + Committer: Sean Owen Committed: Tue Mar 14 13:08:11 2017 + -- .../apache/spark/mllib/feature/Word2Vec.scala | 34 +++- 1 file changed, 19 insertions(+), 15 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5e96a57b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index 531c8b0..6f96813 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -491,8 +491,8 @@ class Word2VecModel private[spark] ( // wordVecNorms: Array of length numWords, each value being the Euclidean norm // of the wordVector. 
- private val wordVecNorms: Array[Double] = { -val wordVecNorms = new Array[Double](numWords) + private val wordVecNorms: Array[Float] = { +val wordVecNorms = new Array[Float](numWords) var i = 0 while (i < numWords) { val vec = wordVectors.slice(i * vectorSize, i * vectorSize + vectorSize) @@ -570,7 +570,7 @@ class Word2VecModel private[spark] ( require(num > 0, "Number of similar words should > 0") val fVector = vector.toArray.map(_.toFloat) -val cosineVec = Array.fill[Float](numWords)(0) +val cosineVec = new Array[Float](numWords) val alpha: Float = 1 val beta: Float = 0 // Normalize input vector before blas.sgemv to avoid Inf value @@ -581,22 +581,23 @@ class Word2VecModel private[spark] ( blas.sgemv( "T", vectorSize, numWords, alpha, wordVectors, vectorSize, fVector, 1, beta, cosineVec, 1) -val cosVec = cosineVec.map(_.toDouble) -var ind = 0 -while (ind < numWords) { - val norm = wordVecNorms(ind) - if (norm == 0.0) { -cosVec(ind) = 0.0 +var i = 0 +while (i < numWords) { + val norm = wordVecNorms(i) + if (norm == 0.0f) { +cosineVec(i) = 0.0f } else { -cosVec(ind) /= norm +cosineVec(i) /= norm } - ind += 1 + i += 1 } -val pq = new BoundedPriorityQueue[(String, Double)](num + 1)(Ordering.by(_._2)) +val pq = new BoundedPriorityQueue[(String, Float)](num + 1)(Ordering.by(_._2)) -for(i <- cosVec.indices) { - pq += Tuple2(wordList(i), cosVec(i)) +var j = 0 +while (j < numWords) { + pq += Tuple2(wordList(j), cosineVec(j)) + j += 1 } val scored = pq.toSeq.sortBy(-_._2) @@ -606,7 +607,10 @@ class Word2VecModel private[spark] ( case None => scored } -filtered.take(num).toArray +filtered + .take(num) + .map { case (word, score) => (word, score.toDouble) } + .toArray } /** - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
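An isolated sketch of the iteration pattern adopted in SPARK-19922 (plain Scala, not Spark code): a `while` loop over a freshly allocated `Float` array avoids both the extra `Array.fill` pass and the overhead of `for (i <- arr.indices)` on hot paths, and stays in single precision throughout.

```scala
// Hedged illustration of the micro-optimizations described in the commit message.
val numWords = 1 << 20
val cosineVec = new Array[Float](numWords)  // already zero-initialized; no Array.fill needed

var i = 0
while (i < numWords) {                       // faster than `for (i <- cosineVec.indices)` on large arrays
  cosineVec(i) = cosineVec(i) / (i + 1).toFloat  // stays in Float; no widening to Double
  i += 1
}
```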
spark git commit: [SPARK-16599][CORE] java.util.NoSuchElementException: None.get at at org.apache.spark.storage.BlockInfoManager.releaseAllLocksForTask
Repository: spark Updated Branches: refs/heads/master ccba622e3 -> 54e61df26 [SPARK-16599][CORE] java.util.NoSuchElementException: None.get at at org.apache.spark.storage.BlockInfoManager.releaseAllLocksForTask ## What changes were proposed in this pull request? Avoid None.get exception in (rare?) case that no readLocks exist Note that while this would resolve the immediate cause of the exception, it's not clear it is the root problem. ## How was this patch tested? Existing tests Author: Sean Owen Closes #17290 from srowen/SPARK-16599. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/54e61df2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/54e61df2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/54e61df2 Branch: refs/heads/master Commit: 54e61df2634163382c7d01a2ad40ffb5e7270abc Parents: ccba622 Author: Sean Owen Authored: Sat Mar 18 18:01:24 2017 +0100 Committer: Sean Owen Committed: Sat Mar 18 18:01:24 2017 +0100 -- .../main/scala/org/apache/spark/storage/BlockInfoManager.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/54e61df2/core/src/main/scala/org/apache/spark/storage/BlockInfoManager.scala -- diff --git a/core/src/main/scala/org/apache/spark/storage/BlockInfoManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockInfoManager.scala index dd8f5ba..490d45d 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockInfoManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockInfoManager.scala @@ -23,7 +23,7 @@ import scala.collection.JavaConverters._ import scala.collection.mutable import scala.reflect.ClassTag -import com.google.common.collect.ConcurrentHashMultiset +import com.google.common.collect.{ConcurrentHashMultiset, ImmutableMultiset} import org.apache.spark.{SparkException, TaskContext} import org.apache.spark.internal.Logging @@ -340,7 +340,7 @@ private[storage] class BlockInfoManager extends Logging { val blocksWithReleasedLocks = mutable.ArrayBuffer[BlockId]() val readLocks = synchronized { - readLocksByTask.remove(taskAttemptId).get + readLocksByTask.remove(taskAttemptId).getOrElse(ImmutableMultiset.of[BlockId]()) } val writeLocks = synchronized { writeLocksByTask.remove(taskAttemptId).getOrElse(Seq.empty) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
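For readers unfamiliar with the failure mode in SPARK-16599, a small standalone Scala sketch (not Spark's actual BlockInfoManager) of why the original line could throw and how `getOrElse` avoids it:

```scala
import scala.collection.mutable

// Hedged sketch: removing a key that was never registered yields None.
val readLocksByTask = mutable.HashMap[Long, List[String]]()

// readLocksByTask.remove(42L).get
//   => java.util.NoSuchElementException: None.get   (the failure being worked around)

val readLocks = readLocksByTask.remove(42L).getOrElse(List.empty[String])
println(readLocks)  // List() -- a safe empty default when no read locks exist
```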
spark git commit: [SPARK-20011][ML][DOCS] Clarify documentation for ALS 'rank' parameter
Repository: spark Updated Branches: refs/heads/master d2dcd6792 -> 7620aed82 [SPARK-20011][ML][DOCS] Clarify documentation for ALS 'rank' parameter ## What changes were proposed in this pull request? API documentation and collaborative filtering documentation page changes to clarify inconsistent description of ALS rank parameter. - [DOCS] was previously: "rank is the number of latent factors in the model." - [API] was previously: "rank - number of features to use" This change describes rank in both places consistently as: - "Number of features to use (also referred to as the number of latent factors)" Author: Chris Snow Author: christopher snow Closes #17345 from snowch/SPARK-20011. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7620aed8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7620aed8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7620aed8 Branch: refs/heads/master Commit: 7620aed828d8baefc425b54684a83c81f1507b02 Parents: d2dcd67 Author: christopher snow Authored: Tue Mar 21 13:23:59 2017 + Committer: Sean Owen Committed: Tue Mar 21 13:23:59 2017 + -- docs/mllib-collaborative-filtering.md | 2 +- .../org/apache/spark/mllib/recommendation/ALS.scala | 16 python/pyspark/mllib/recommendation.py | 4 ++-- 3 files changed, 11 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7620aed8/docs/mllib-collaborative-filtering.md -- diff --git a/docs/mllib-collaborative-filtering.md b/docs/mllib-collaborative-filtering.md index 0f891a0..d1bb6d6 100644 --- a/docs/mllib-collaborative-filtering.md +++ b/docs/mllib-collaborative-filtering.md @@ -20,7 +20,7 @@ algorithm to learn these latent factors. The implementation in `spark.mllib` has following parameters: * *numBlocks* is the number of blocks used to parallelize computation (set to -1 to auto-configure). -* *rank* is the number of latent factors in the model. +* *rank* is the number of features to use (also referred to as the number of latent factors). * *iterations* is the number of iterations of ALS to run. ALS typically converges to a reasonable solution in 20 iterations or less. * *lambda* specifies the regularization parameter in ALS. http://git-wip-us.apache.org/repos/asf/spark/blob/7620aed8/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala index 76b1bc1..1428822 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala @@ -301,7 +301,7 @@ object ALS { * level of parallelism. * * @param ratingsRDD of [[Rating]] objects with userID, productID, and rating - * @param rank number of features to use + * @param rank number of features to use (also referred to as the number of latent factors) * @param iterations number of iterations of ALS * @param lambda regularization parameter * @param blocks level of parallelism to split computation into @@ -326,7 +326,7 @@ object ALS { * level of parallelism. 
* * @param ratingsRDD of [[Rating]] objects with userID, productID, and rating - * @param rank number of features to use + * @param rank number of features to use (also referred to as the number of latent factors) * @param iterations number of iterations of ALS * @param lambda regularization parameter * @param blocks level of parallelism to split computation into @@ -349,7 +349,7 @@ object ALS { * parallelism automatically based on the number of partitions in `ratings`. * * @param ratingsRDD of [[Rating]] objects with userID, productID, and rating - * @param rank number of features to use + * @param rank number of features to use (also referred to as the number of latent factors) * @param iterations number of iterations of ALS * @param lambda regularization parameter */ @@ -366,7 +366,7 @@ object ALS { * parallelism automatically based on the number of partitions in `ratings`. * * @param ratingsRDD of [[Rating]] objects with userID, productID, and rating - * @param rank number of features to use + * @param rank number of features to use (also referred to as the number of latent factors) * @param iterations number of iterations of ALS */ @Since("0.8.0") @@ -383,7 +383,7 @@ object ALS { * a level of parallelism g
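To ground the clarified wording from SPARK-20011, a small spark.mllib sketch showing where `rank`, the number of latent factors, is supplied. It assumes an existing `SparkContext` named `sc`, and the toy ratings are invented for illustration.

```scala
import org.apache.spark.mllib.recommendation.{ALS, Rating}

// Hedged example: toy data only, values are arbitrary.
val ratings = sc.parallelize(Seq(
  Rating(1, 10, 4.0), Rating(1, 20, 1.0), Rating(2, 10, 5.0)))

val rank = 10          // number of features to use, i.e. the number of latent factors
val numIterations = 10
val lambda = 0.01      // regularization parameter

val model = ALS.train(ratings, rank, numIterations, lambda)
```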
spark git commit: [SPARK-19998][BLOCK MANAGER] Change the exception log to add the RDD id of the related block
Repository: spark Updated Branches: refs/heads/master 7620aed82 -> 650d03cfc [SPARK-19998][BLOCK MANAGER] Change the exception log to add the RDD id of the related block ## What changes were proposed in this pull request? The message "java.lang.Exception: Could not compute split, block $blockId not found" does not carry the RDD id, while the log line "BlockManager: Removing RDD $id" carries only the RDD id, so it is hard to see that the removal is the reason for the exception. It is better for the "block not found" exception to include the RDD id as well. ## How was this patch tested? Existing tests Author: jianran.tfh Author: jianran Closes #17334 from jianran/SPARK-19998. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/650d03cf Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/650d03cf Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/650d03cf Branch: refs/heads/master Commit: 650d03cfc9a609a2c603f9ced452d03ec8429b0d Parents: 7620aed Author: jianran.tfh Authored: Tue Mar 21 15:15:19 2017 + Committer: Sean Owen Committed: Tue Mar 21 15:15:19 2017 + -- core/src/main/scala/org/apache/spark/rdd/BlockRDD.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/650d03cf/core/src/main/scala/org/apache/spark/rdd/BlockRDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/BlockRDD.scala b/core/src/main/scala/org/apache/spark/rdd/BlockRDD.scala index d47b755..4e036c2 100644 --- a/core/src/main/scala/org/apache/spark/rdd/BlockRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/BlockRDD.scala @@ -47,7 +47,7 @@ class BlockRDD[T: ClassTag](sc: SparkContext, @transient val blockIds: Array[Blo blockManager.get[T](blockId) match { case Some(block) => block.data.asInstanceOf[Iterator[T]] case None => -throw new Exception("Could not compute split, block " + blockId + " not found") +throw new Exception(s"Could not compute split, block $blockId of RDD $id not found") } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-20021][PYSPARK] Miss backslash in python code
Repository: spark Updated Branches: refs/heads/master 7343a0940 -> facfd6088 [SPARK-20021][PYSPARK] Miss backslash in python code ## What changes were proposed in this pull request? Add backslash for line continuation in python code. ## How was this patch tested? Jenkins. Author: uncleGen Author: dylon Closes #17352 from uncleGen/python-example-doc. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/facfd608 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/facfd608 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/facfd608 Branch: refs/heads/master Commit: facfd608865c385c0dabfe09cffe5874532a9cdf Parents: 7343a09 Author: uncleGen Authored: Wed Mar 22 11:10:08 2017 + Committer: Sean Owen Committed: Wed Mar 22 11:10:08 2017 + -- docs/structured-streaming-programming-guide.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/facfd608/docs/structured-streaming-programming-guide.md -- diff --git a/docs/structured-streaming-programming-guide.md b/docs/structured-streaming-programming-guide.md index 7988472..ff07ad1 100644 --- a/docs/structured-streaming-programming-guide.md +++ b/docs/structured-streaming-programming-guide.md @@ -764,11 +764,11 @@ Dataset windowedCounts = words words = ... # streaming DataFrame of schema { timestamp: Timestamp, word: String } # Group the data by window and word and compute the count of each group -windowedCounts = words -.withWatermark("timestamp", "10 minutes") +windowedCounts = words \ +.withWatermark("timestamp", "10 minutes") \ .groupBy( window(words.timestamp, "10 minutes", "5 minutes"), -words.word) +words.word) \ .count() {% endhighlight %} - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-20021][PYSPARK] Miss backslash in python code
Repository: spark Updated Branches: refs/heads/branch-2.1 277ed375b -> 56f997f13 [SPARK-20021][PYSPARK] Miss backslash in python code ## What changes were proposed in this pull request? Add backslash for line continuation in python code. ## How was this patch tested? Jenkins. Author: uncleGen Author: dylon Closes #17352 from uncleGen/python-example-doc. (cherry picked from commit facfd608865c385c0dabfe09cffe5874532a9cdf) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/56f997f1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/56f997f1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/56f997f1 Branch: refs/heads/branch-2.1 Commit: 56f997f1355dc119dfb038d269d8f2f5170f559a Parents: 277ed37 Author: uncleGen Authored: Wed Mar 22 11:10:08 2017 + Committer: Sean Owen Committed: Wed Mar 22 11:10:18 2017 + -- docs/structured-streaming-programming-guide.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/56f997f1/docs/structured-streaming-programming-guide.md -- diff --git a/docs/structured-streaming-programming-guide.md b/docs/structured-streaming-programming-guide.md index d316e04..f73cf93 100644 --- a/docs/structured-streaming-programming-guide.md +++ b/docs/structured-streaming-programming-guide.md @@ -788,11 +788,11 @@ Dataset windowedCounts = words words = ... # streaming DataFrame of schema { timestamp: Timestamp, word: String } # Group the data by window and word and compute the count of each group -windowedCounts = words -.withWatermark("timestamp", "10 minutes") +windowedCounts = words \ +.withWatermark("timestamp", "10 minutes") \ .groupBy( window(words.timestamp, "10 minutes", "5 minutes"), -words.word) +words.word) \ .count() {% endhighlight %} - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-20027][DOCS] Compilation fix in java docs.
Repository: spark Updated Branches: refs/heads/master facfd6088 -> 0caade634 [SPARK-20027][DOCS] Compilation fix in java docs. ## What changes were proposed in this pull request? During build/sbt publish-local, build breaks due to javadocs errors. This patch fixes those errors. ## How was this patch tested? Tested by running the sbt build. Author: Prashant Sharma Closes #17358 from ScrapCodes/docs-fix. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0caade63 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0caade63 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0caade63 Branch: refs/heads/master Commit: 0caade634076034182e22318eb09a6df1c560576 Parents: facfd60 Author: Prashant Sharma Authored: Wed Mar 22 13:52:03 2017 + Committer: Sean Owen Committed: Wed Mar 22 13:52:03 2017 + -- .../java/org/apache/spark/network/crypto/ClientChallenge.java | 2 +- .../main/java/org/apache/spark/network/crypto/ServerResponse.java | 2 +- .../src/main/java/org/apache/spark/unsafe/types/UTF8String.java | 2 +- .../spark/api/java/function/FlatMapGroupsWithStateFunction.java | 3 ++- 4 files changed, 5 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0caade63/common/network-common/src/main/java/org/apache/spark/network/crypto/ClientChallenge.java -- diff --git a/common/network-common/src/main/java/org/apache/spark/network/crypto/ClientChallenge.java b/common/network-common/src/main/java/org/apache/spark/network/crypto/ClientChallenge.java index 3312a5b..819b8a7 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/crypto/ClientChallenge.java +++ b/common/network-common/src/main/java/org/apache/spark/network/crypto/ClientChallenge.java @@ -28,7 +28,7 @@ import org.apache.spark.network.protocol.Encoders; /** * The client challenge message, used to initiate authentication. * - * @see README.md + * Please see crypto/README.md for more details of implementation. */ public class ClientChallenge implements Encodable { /** Serialization tag used to catch incorrect payloads. */ http://git-wip-us.apache.org/repos/asf/spark/blob/0caade63/common/network-common/src/main/java/org/apache/spark/network/crypto/ServerResponse.java -- diff --git a/common/network-common/src/main/java/org/apache/spark/network/crypto/ServerResponse.java b/common/network-common/src/main/java/org/apache/spark/network/crypto/ServerResponse.java index affdbf4..caf3a0f 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/crypto/ServerResponse.java +++ b/common/network-common/src/main/java/org/apache/spark/network/crypto/ServerResponse.java @@ -28,7 +28,7 @@ import org.apache.spark.network.protocol.Encoders; /** * Server's response to client's challenge. * - * @see README.md + * Please see crypto/README.md for more details. */ public class ServerResponse implements Encodable { /** Serialization tag used to catch incorrect payloads. 
*/ http://git-wip-us.apache.org/repos/asf/spark/blob/0caade63/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java -- diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index 4c28075..5437e99 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -863,7 +863,7 @@ public final class UTF8String implements Comparable, Externalizable, * This is done solely for better performance and is not expected to be used by end users. * * {@link LongWrapper} could have been used here but using `int` directly save the extra cost of - * conversion from `long` -> `int` + * conversion from `long` to `int` */ public static class IntWrapper { public int value = 0; http://git-wip-us.apache.org/repos/asf/spark/blob/0caade63/sql/core/src/main/java/org/apache/spark/api/java/function/FlatMapGroupsWithStateFunction.java -- diff --git a/sql/core/src/main/java/org/apache/spark/api/java/function/FlatMapGroupsWithStateFunction.java b/sql/core/src/main/java/org/apache/spark/api/java/function/FlatMapGroupsWithStateFunction.java index 29af78c..bdda8aa 100644 --- a/sql/core/src/main/java/org/apache/spark/api/java/function/FlatMapGroupsWithStateFunction.java +++ b/sql/core/src/main/java/org/apache/spark/api/java/function/FlatMapGroupsWithStateFunction.java @@ -28,7 +28
spark git commit: [MINOR][BUILD] Fix javadoc8 break
Repository: spark Updated Branches: refs/heads/master 07c12c09a -> aefe79890 [MINOR][BUILD] Fix javadoc8 break ## What changes were proposed in this pull request? Several javadoc8 breaks have been introduced. This PR proposes fix those instances so that we can build Scala/Java API docs. ``` [error] .../spark/sql/core/target/java/org/apache/spark/sql/streaming/GroupState.java:6: error: reference not found [error] * flatMapGroupsWithState operations on {link KeyValueGroupedDataset}. [error] ^ [error] .../spark/sql/core/target/java/org/apache/spark/sql/streaming/GroupState.java:10: error: reference not found [error] * Both, mapGroupsWithState and flatMapGroupsWithState in {link KeyValueGroupedDataset} [error] ^ [error] .../spark/sql/core/target/java/org/apache/spark/sql/streaming/GroupState.java:51: error: reference not found [error] *{link GroupStateTimeout.ProcessingTimeTimeout}) or event time (i.e. [error] ^ [error] .../spark/sql/core/target/java/org/apache/spark/sql/streaming/GroupState.java:52: error: reference not found [error] *{link GroupStateTimeout.EventTimeTimeout}). [error] ^ [error] .../spark/sql/core/target/java/org/apache/spark/sql/streaming/GroupState.java:158: error: reference not found [error] * Spark SQL types (see {link Encoder} for more details). [error] ^ [error] .../spark/mllib/target/java/org/apache/spark/ml/fpm/FPGrowthParams.java:26: error: bad use of '>' [error]* Number of partitions (>=1) used by parallel FP-growth. By default the param is not set, and [error]^ [error] .../spark/sql/core/src/main/java/org/apache/spark/api/java/function/FlatMapGroupsWithStateFunction.java:30: error: reference not found [error] * {link org.apache.spark.sql.KeyValueGroupedDataset#flatMapGroupsWithState( [error] ^ [error] .../spark/sql/core/target/java/org/apache/spark/sql/KeyValueGroupedDataset.java:211: error: reference not found [error]* See {link GroupState} for more details. [error] ^ [error] .../spark/sql/core/target/java/org/apache/spark/sql/KeyValueGroupedDataset.java:232: error: reference not found [error]* See {link GroupState} for more details. [error] ^ [error] .../spark/sql/core/target/java/org/apache/spark/sql/KeyValueGroupedDataset.java:254: error: reference not found [error]* See {link GroupState} for more details. [error] ^ [error] .../spark/sql/core/target/java/org/apache/spark/sql/KeyValueGroupedDataset.java:277: error: reference not found [error]* See {link GroupState} for more details. [error] ^ [error] .../spark/core/target/java/org/apache/spark/TaskContextImpl.java:10: error: reference not found [error] * {link TaskMetrics} & {link MetricsSystem} objects are not thread safe. [error] ^ [error] .../spark/core/target/java/org/apache/spark/TaskContextImpl.java:10: error: reference not found [error] * {link TaskMetrics} & {link MetricsSystem} objects are not thread safe. [error] ^ [info] 13 errors ``` ``` jekyll 3.3.1 | Error: Unidoc generation failed ``` ## How was this patch tested? Manually via `jekyll build` Author: hyukjinkwon Closes #17389 from HyukjinKwon/minor-javadoc8-fix. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/aefe7989 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/aefe7989 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/aefe7989 Branch: refs/heads/master Commit: aefe79890541bc0829f184e03eb3961739ca8ef2 Parents: 07c12c0 Author: hyukjinkwon Authored: Thu Mar 23 08:41:30 2017 + Committer: Sean Owen Committed: Thu Mar 23 08:41:30 2017 + -- .../org/apache/spark/TaskContextImpl.scala | 2 +- .../org/apache/spark/ml/fpm/FPGrowth.scala | 4 ++-- .../FlatMapGroupsWithStateFunction.java | 2 +- .../spark/sql/KeyValueGroupedDataset.scala | 8 +++ .../apache/spark/sql/streaming/GroupState.scala | 22 ++-- 5 files changed, 19 insertions(+), 19 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/aefe7989/core/src/main/scala/org/apache/spark/TaskContextImpl.scala -- diff --git a/core/src/main/scala/org/apache/spark/TaskContextImpl.scala b/core/src/main/scala/org/apache/spark/TaskContextImpl.scala index ea8dcdf..f346cf8 100644 --- a/core/src/main/scala/org/apache/spark/TaskContextImpl.scala +++ b/core/src/main/scala/org/apache/spark/TaskContextImpl.scala @@ -38,7 +38
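The breaks above come largely from Scaladoc comments carried over into the generated Java sources, where javadoc 8's stricter doclint rejects unresolvable `{@link ...}` references and bare `>` characters. A minimal sketch of a comment style that avoids both failure modes (the class is illustrative, not the actual patch):

```scala
/**
 * Number of partitions (must be at least 1) used by parallel FP-growth.
 *
 * Related API such as `KeyValueGroupedDataset.flatMapGroupsWithState` and `GroupState` is
 * referenced with plain code markup, and the comparison is spelled out instead of using a
 * bare greater-than sign, so the generated Java docs contain no unresolved link references
 * and no doclint complaints about raw HTML characters.
 */
class FPGrowthParamsSketch(val numPartitions: Int)
```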
spark git commit: [INFRA] Close stale PRs
Repository: spark Updated Branches: refs/heads/master aefe79890 -> b70c03a42 [INFRA] Close stale PRs Closes #16819 Closes #13467 Closes #16083 Closes #17135 Closes #8785 Closes #16278 Closes #16997 Closes #17073 Closes #17220 Added: Closes #12059 Closes #12524 Closes #12888 Closes #16061 Author: Sean Owen Closes #17386 from srowen/StalePRs. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b70c03a4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b70c03a4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b70c03a4 Branch: refs/heads/master Commit: b70c03a42002e924e979acbc98a8b464830be532 Parents: aefe798 Author: Sean Owen Authored: Thu Mar 23 08:42:42 2017 + Committer: Sean Owen Committed: Thu Mar 23 08:42:42 2017 + -- --
spark git commit: [SPARK-20078][MESOS] Mesos executor configurability for task name and labels
Repository: spark Updated Branches: refs/heads/master a2ce0a2e3 -> e8ddb91c7 [SPARK-20078][MESOS] Mesos executor configurability for task name and labels ## What changes were proposed in this pull request? Adding configurable mesos executor names and labels using `spark.mesos.task.name` and `spark.mesos.task.labels`. Labels were defined as `k1:v1,k2:v2`. mgummelt ## How was this patch tested? Added unit tests to verify labels were added correctly, with incorrect labels being ignored and added a test to test the name of the executor. Tested with: `./build/sbt -Pmesos mesos/test` Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Kalvin Chau Closes #17404 from kalvinnchau/mesos-config. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e8ddb91c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e8ddb91c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e8ddb91c Branch: refs/heads/master Commit: e8ddb91c7ea5a0b4576cf47aaf969bcc82860b7c Parents: a2ce0a2 Author: Kalvin Chau Authored: Sat Mar 25 10:42:15 2017 + Committer: Sean Owen Committed: Sat Mar 25 10:42:15 2017 + -- .../mesos/MesosCoarseGrainedSchedulerBackend.scala | 3 ++- .../mesos/MesosCoarseGrainedSchedulerBackendSuite.scala | 11 +++ 2 files changed, 13 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e8ddb91c/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala -- diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala index c049a32..5bdc2a2 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala @@ -403,7 +403,8 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( .setTaskId(TaskID.newBuilder().setValue(taskId.toString).build()) .setSlaveId(offer.getSlaveId) .setCommand(createCommand(offer, taskCPUs + extraCoresPerExecutor, taskId)) -.setName("Task " + taskId) +.setName(s"${sc.appName} $taskId") + taskBuilder.addAllResources(resourcesToUse.asJava) taskBuilder.setContainer(MesosSchedulerBackendUtil.containerInfo(sc.conf)) http://git-wip-us.apache.org/repos/asf/spark/blob/e8ddb91c/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala -- diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala index 98033be..eb83926 100644 --- a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala +++ b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala @@ -464,6 +464,17 @@ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite assert(!uris.asScala.head.getCache) } + test("mesos sets task name to spark.app.name") { +setBackend() + +val offers = List(Resources(backend.executorMemory(sc), 1)) 
+offerResources(offers) +val launchedTasks = verifyTaskLaunched(driver, "o1") + +// Add " 0" to the taskName to match the executor number that is appended +assert(launchedTasks.head.getName == "test-mesos-dynamic-alloc 0") + } + test("mesos supports spark.mesos.network.name") { setBackend(Map( "spark.mesos.network.name" -> "test-network-name" - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
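For context on the `spark.mesos.task.labels` format mentioned above (`k1:v1,k2:v2`, with incorrect labels ignored), a small stand-alone sketch of how such a string can be split into key/value pairs; it is illustrative only, with a made-up helper name, and is not the scheduler backend's actual parsing code:

```scala
// Illustrative only: parse a "k1:v1,k2:v2" style label string into key/value pairs,
// silently dropping malformed entries (mirroring the "incorrect labels are ignored"
// behaviour described in the PR text above).
def parseLabels(spec: String): Seq[(String, String)] =
  spec.split(",").toSeq.flatMap { entry =>
    entry.split(":") match {
      case Array(key, value) if key.nonEmpty => Some(key -> value)
      case _                                 => None   // ignore malformed entries
    }
  }

// parseLabels("k1:v1,k2:v2")  => Seq(("k1", "v1"), ("k2", "v2"))
// parseLabels("k1:v1,broken") => Seq(("k1", "v1"))
```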
spark git commit: logging improvements
Repository: spark Updated Branches: refs/heads/master 93bb0b911 -> 362ee9329 logging improvements ## What changes were proposed in this pull request? Adding additional information to existing logging messages: - YarnAllocator: log the executor ID together with the container id when a container for an executor is launched. - NettyRpcEnv: log the receiver address when there is a timeout waiting for an answer to a remote call. - ExecutorAllocationManager: fix a typo in the logging message for the list of executors to be removed. ## How was this patch tested? Build spark and submit the word count example to a YARN cluster using cluster mode Author: Juan Rodriguez Hortala Closes #17411 from juanrh/logging-improvements. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/362ee932 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/362ee932 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/362ee932 Branch: refs/heads/master Commit: 362ee93296a0de6342b4339e941e6a11f445c5b2 Parents: 93bb0b9 Author: Juan Rodriguez Hortala Authored: Sun Mar 26 10:39:05 2017 +0100 Committer: Sean Owen Committed: Sun Mar 26 10:39:05 2017 +0100 -- .../main/scala/org/apache/spark/ExecutorAllocationManager.scala | 2 +- core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala | 3 ++- .../main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/362ee932/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala -- diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala index 1366251..261b332 100644 --- a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala +++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala @@ -439,7 +439,7 @@ private[spark] class ExecutorAllocationManager( executorsRemoved } else { logWarning(s"Unable to reach the cluster manager to kill executor/s " + -"executorIdsToBeRemoved.mkString(\",\") or no executor eligible to kill!") +s"${executorIdsToBeRemoved.mkString(",")} or no executor eligible to kill!") Seq.empty[String] } } http://git-wip-us.apache.org/repos/asf/spark/blob/362ee932/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala -- diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala b/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala index ff5e39a..b316e54 100644 --- a/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala +++ b/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala @@ -236,7 +236,8 @@ private[netty] class NettyRpcEnv( val timeoutCancelable = timeoutScheduler.schedule(new Runnable { override def run(): Unit = { - onFailure(new TimeoutException(s"Cannot receive any reply in ${timeout.duration}")) + onFailure(new TimeoutException(s"Cannot receive any reply from ${remoteAddr} " + +s"in ${timeout.duration}")) } }, timeout.duration.toNanos, TimeUnit.NANOSECONDS) promise.future.onComplete { v => http://git-wip-us.apache.org/repos/asf/spark/blob/362ee932/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala -- diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala index abd2de7..2555676 100644 --- 
a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala @@ -494,7 +494,8 @@ private[yarn] class YarnAllocator( val containerId = container.getId val executorId = executorIdCounter.toString assert(container.getResource.getMemory >= resource.getMemory) - logInfo(s"Launching container $containerId on host $executorHostname") + logInfo(s"Launching container $containerId on host $executorHostname " + +s"for executor with ID $executorId") def updateInternalState(): Unit = synchronized { numExecutorsRunning += 1
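The `ExecutorAllocationManager` hunk above is a string-interpolation fix: the expression had ended up inside the quotes, so the log printed the code instead of the executor IDs. A minimal stand-alone sketch of the difference:

```scala
val executorIdsToBeRemoved = Seq("1", "7")

// Before the fix: the expression is part of the literal, so the log shows the code itself.
val broken = "Unable to reach the cluster manager to kill executor/s " +
  "executorIdsToBeRemoved.mkString(\",\") or no executor eligible to kill!"

// After the fix: the expression is interpolated, so the log shows the actual IDs.
val fixed = s"Unable to reach the cluster manager to kill executor/s " +
  s"${executorIdsToBeRemoved.mkString(",")} or no executor eligible to kill!"

println(broken) // ... executorIdsToBeRemoved.mkString(",") or no executor eligible to kill!
println(fixed)  // ... 1,7 or no executor eligible to kill!
```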
spark-website git commit: added section about mima
Repository: spark-website Updated Branches: refs/heads/asf-site 39838046c -> 8b27c470c added section about mima Project: http://git-wip-us.apache.org/repos/asf/spark-website/repo Commit: http://git-wip-us.apache.org/repos/asf/spark-website/commit/8b27c470 Tree: http://git-wip-us.apache.org/repos/asf/spark-website/tree/8b27c470 Diff: http://git-wip-us.apache.org/repos/asf/spark-website/diff/8b27c470 Branch: refs/heads/asf-site Commit: 8b27c470c32b30f6bddafd4cdec2b75b5b975fb6 Parents: 3983804 Author: Benjamin Fradet Authored: Sat Mar 25 21:48:40 2017 + Committer: Benjamin Fradet Committed: Sun Mar 26 18:14:45 2017 +0100 -- developer-tools.md| 56 +- site/developer-tools.html | 51 +- 2 files changed, 105 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark-website/blob/8b27c470/developer-tools.md -- diff --git a/developer-tools.md b/developer-tools.md index 88f3f36..547d8aa 100644 --- a/developer-tools.md +++ b/developer-tools.md @@ -111,6 +111,60 @@ To run individual Java tests, you can use the `-Dtest` flag: build/mvn test -DwildcardSuites=none -Dtest=org.apache.spark.streaming.JavaAPISuite test ``` +Binary compatibility + +To ensure binary compatibility, Spark uses [MiMa](https://github.com/typesafehub/migration-manager). + +Ensuring binary compatibility + +When working on an issue, it's always a good idea to check that your changes do +not introduce binary incompatibilities before opening a pull request. + +You can do so by running the following command: + +``` +$ dev/mima +``` + +A binary incompatibility reported by MiMa might look like the following: + +``` +[error] method this(org.apache.spark.sql.Dataset)Unit in class org.apache.spark.SomeClass does not have a correspondent in current version +[error] filter with: ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.SomeClass.this") +``` + +If you open a pull request containing binary incompatibilities anyway, Jenkins +will remind you by failing the test build with the following message: + +``` +Test build #xx has finished for PR yy at commit ff. + + This patch fails MiMa tests. + This patch merges cleanly. + This patch adds no public classes. +``` + +Solving a binary incompatibility + +If you believe that your binary incompatibilies are justified or that MiMa +reported false positives (e.g. the reported binary incompatibilities are about a +non-user facing API), you can filter them out by adding an exclusion in +[project/MimaExcludes.scala](https://github.com/apache/spark/blob/master/project/MimaExcludes.scala) +containing what was suggested by the MiMa report and a comment containing the +JIRA number of the issue you're working on as well as its title. + +For the problem described above, we might add the following: + +{% highlight scala %} +// [SPARK-zz][CORE] Fix an issue +ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.SomeClass.this") +{% endhighlight %} + +Otherwise, you will have to resolve those incompatibilies before opening or +updating your pull request. Usually, the problems reported by MiMa are +self-explanatory and revolve around missing members (methods or fields) that +you will have to add back in order to maintain binary compatibility. + Checking Out Pull Requests Git provides a mechanism for fetching remote pull requests into your own local repository. @@ -181,7 +235,7 @@ It is due to an incorrect Scala library in the classpath. 
To fix it: - Remove `scala-library-2.10.4.jar - lib_managed\jars` In the event of "Could not find resource path for Web UI: org/apache/spark/ui/static", -it's due to a classpath issue (some classes were probably not compiled). To fix this, it +it's due to a classpath issue (some classes were probably not compiled). To fix this, it is sufficient to run a test from the command line: ``` http://git-wip-us.apache.org/repos/asf/spark-website/blob/8b27c470/site/developer-tools.html -- diff --git a/site/developer-tools.html b/site/developer-tools.html index d09815d..a44bfde 100644 --- a/site/developer-tools.html +++ b/site/developer-tools.html @@ -287,6 +287,55 @@ restart whenever build/mvn is called. build/mvn test -DwildcardSuites=none -Dtest=org.apache.spark.streaming.JavaAPISuite test +Binary compatibility + +To ensure binary compatibility, Spark uses https://github.com/typesafehub/migration-manager";>MiMa. + +Ensuring binary compatibility + +When working on an issue, it’s always a good idea to check that your changes do +not introduce binary incompatibilities before opening a pull request. + +You can do so by running the following command:
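As a concrete illustration of what `dev/mima` catches, here is a hedged, made-up example (the class is hypothetical, not Spark API) of a change that still compiles for downstream sources but breaks binary compatibility:

```scala
// Hypothetical public API in a released library:
//   class SomeClass(val name: String) { def greet(): String = s"hi $name" }
//
// Proposed change below: the old greet() method disappears from the bytecode, so jars
// compiled against the previous release fail at link time even though their sources
// would still compile against the new one.
class SomeClass(val name: String) {
  def greet(prefix: String): String = s"$prefix $name"  // signature changed, old greet() removed
}
// MiMa would report this as a missing-method problem for SomeClass.greet; if the change is
// justified, the suggested ProblemFilters exclusion goes into project/MimaExcludes.scala
// with a comment referencing the JIRA, as described in the new docs above.
```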
spark git commit: [SPARK-20107][DOC] Add spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version option to configuration.md
Repository: spark Updated Branches: refs/heads/master 471de5db5 -> edc87d76e [SPARK-20107][DOC] Add spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version option to configuration.md ## What changes were proposed in this pull request? Add `spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version` option to `configuration.md`. Set `spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version=2` can speed up [HadoopMapReduceCommitProtocol.commitJob](https://github.com/apache/spark/blob/v2.1.0/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala#L121) for many output files. All cloudera's hadoop 2.6.0-cdh5.4.0 or higher versions(see: https://github.com/cloudera/hadoop-common/commit/1c1236182304d4075276c00c4592358f428bc433 and https://github.com/cloudera/hadoop-common/commit/16b2de27321db7ce2395c08baccfdec5562017f0) and apache's hadoop 2.7.0 or higher versions support this improvement. More see: 1. [MAPREDUCE-4815](https://issues.apache.org/jira/browse/MAPREDUCE-4815): Speed up FileOutputCommitter#commitJob for many output files. 2. [MAPREDUCE-6406](https://issues.apache.org/jira/browse/MAPREDUCE-6406): Update the default version for the property mapreduce.fileoutputcommitter.algorithm.version to 2. ## How was this patch tested? Manual test and exist tests. Author: Yuming Wang Closes #17442 from wangyum/SPARK-20107. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/edc87d76 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/edc87d76 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/edc87d76 Branch: refs/heads/master Commit: edc87d76efea7b4d19d9d0c4ddba274a3ccb8752 Parents: 471de5d Author: Yuming Wang Authored: Thu Mar 30 10:39:57 2017 +0100 Committer: Sean Owen Committed: Thu Mar 30 10:39:57 2017 +0100 -- docs/configuration.md | 9 + 1 file changed, 9 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/edc87d76/docs/configuration.md -- diff --git a/docs/configuration.md b/docs/configuration.md index 4729f1b..a975392 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1137,6 +1137,15 @@ Apart from these, the following properties are also available, and may be useful mapping has high overhead for blocks close to or below the page size of the operating system. + + spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version + 1 + +The file output committer algorithm version, valid algorithm version number: 1 or 2. +Version 2 may have better performance, but version 1 may handle failures better in certain situations, +as per https://issues.apache.org/jira/browse/MAPREDUCE-4815";>MAPREDUCE-4815. + + ### Networking - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
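Because `spark.hadoop.*` properties are copied into the Hadoop `Configuration`, the option can also be set per application rather than only in the global form shown in `configuration.md`. A small sketch (the application name is made up):

```scala
import org.apache.spark.sql.SparkSession

// Opt into the version 2 commit algorithm for this application only; version 1 remains
// the default and, as the docs note, may handle failures better in some situations.
val spark = SparkSession.builder()
  .appName("committer-v2-example")
  .config("spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version", "2")
  .getOrCreate()
```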
spark git commit: [DOCS] Docs-only improvements
Repository: spark Updated Branches: refs/heads/master b454d4402 -> 0197262a3 [DOCS] Docs-only improvements â¦adoc ## What changes were proposed in this pull request? Use recommended values for row boundaries in Window's scaladoc, i.e. `Window.unboundedPreceding`, `Window.unboundedFollowing`, and `Window.currentRow` (that were introduced in 2.1.0). ## How was this patch tested? Local build Author: Jacek Laskowski Closes #17417 from jaceklaskowski/window-expression-scaladoc. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0197262a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0197262a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0197262a Branch: refs/heads/master Commit: 0197262a358fd174a188f8246ae777e53157610e Parents: b454d44 Author: Jacek Laskowski Authored: Thu Mar 30 16:07:27 2017 +0100 Committer: Sean Owen Committed: Thu Mar 30 16:07:27 2017 +0100 -- .../org/apache/spark/memory/MemoryConsumer.java | 2 -- .../sort/BypassMergeSortShuffleWriter.java | 5 ++--- .../apache/spark/ExecutorAllocationClient.scala | 5 ++--- .../scala/org/apache/spark/scheduler/Task.scala | 2 +- .../apache/spark/serializer/Serializer.scala| 2 +- .../spark/shuffle/BlockStoreShuffleReader.scala | 3 +-- .../shuffle/IndexShuffleBlockResolver.scala | 4 ++-- .../spark/shuffle/sort/SortShuffleManager.scala | 4 ++-- .../org/apache/spark/util/AccumulatorV2.scala | 2 +- .../spark/examples/ml/DataFrameExample.scala| 2 +- .../org/apache/spark/ml/stat/Correlation.scala | 2 +- .../sql/catalyst/analysis/ResolveHints.scala| 2 +- .../catalyst/encoders/ExpressionEncoder.scala | 6 ++--- .../sql/catalyst/expressions/Expression.scala | 2 +- .../expressions/windowExpressions.scala | 2 +- .../spark/sql/catalyst/optimizer/objects.scala | 2 +- .../spark/sql/catalyst/parser/AstBuilder.scala | 6 ++--- .../spark/sql/catalyst/plans/QueryPlan.scala| 5 +++-- .../catalyst/plans/logical/LogicalPlan.scala| 2 +- .../catalyst/parser/ExpressionParserSuite.scala | 3 ++- .../scala/org/apache/spark/sql/Column.scala | 18 +++ .../org/apache/spark/sql/DatasetHolder.scala| 3 ++- .../org/apache/spark/sql/SparkSession.scala | 2 +- .../spark/sql/execution/command/databases.scala | 2 +- .../spark/sql/execution/streaming/Source.scala | 2 +- .../apache/spark/sql/expressions/Window.scala | 23 ++-- .../spark/sql/expressions/WindowSpec.scala | 20 - .../scala/org/apache/spark/sql/functions.scala | 2 +- .../sql/hive/HiveSessionStateBuilder.scala | 2 +- .../streaming/scheduler/InputInfoTracker.scala | 2 +- 30 files changed, 68 insertions(+), 71 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0197262a/core/src/main/java/org/apache/spark/memory/MemoryConsumer.java -- diff --git a/core/src/main/java/org/apache/spark/memory/MemoryConsumer.java b/core/src/main/java/org/apache/spark/memory/MemoryConsumer.java index fc1f3a8..48cf4b9 100644 --- a/core/src/main/java/org/apache/spark/memory/MemoryConsumer.java +++ b/core/src/main/java/org/apache/spark/memory/MemoryConsumer.java @@ -60,8 +60,6 @@ public abstract class MemoryConsumer { /** * Force spill during building. - * - * For testing. 
*/ public void spill() throws IOException { spill(Long.MAX_VALUE, this); http://git-wip-us.apache.org/repos/asf/spark/blob/0197262a/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java -- diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java b/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java index 4a15559..323a5d3 100644 --- a/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java +++ b/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java @@ -52,8 +52,7 @@ import org.apache.spark.util.Utils; * This class implements sort-based shuffle's hash-style shuffle fallback path. This write path * writes incoming records to separate files, one file per reduce partition, then concatenates these * per-partition files to form a single output file, regions of which are served to reducers. - * Records are not buffered in memory. This is essentially identical to - * {@link org.apache.spark.shuffle.hash.HashShuffleWriter}, except that it writes output in a format + * Records are not buffered in memory. It writes output in a format * that can be served / consumed via {@link org.apache.spark.shuffle.IndexShuffleBlockResolver}. * * This write path is in
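The row-boundary constants this change recommends read like this in user code; a short sketch with made-up column names (`category`, `ts`, `amount`):

```scala
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.sum

// Running total per category, using the named boundaries introduced in 2.1.0
// instead of raw Long values.
val runningTotal = Window
  .partitionBy("category")
  .orderBy("ts")
  .rowsBetween(Window.unboundedPreceding, Window.currentRow)

// df.withColumn("running_amount", sum("amount").over(runningTotal))
```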
spark git commit: [SPARK-19999] Workaround JDK-8165231 to identify PPC64 architectures as supporting unaligned access
Repository: spark Updated Branches: refs/heads/master 0197262a3 -> 258bff2c3 [SPARK-1] Workaround JDK-8165231 to identify PPC64 architectures as supporting unaligned access java.nio.Bits.unaligned() does not return true for the ppc64le arch. see https://bugs.openjdk.java.net/browse/JDK-8165231 ## What changes were proposed in this pull request? check architecture ## How was this patch tested? unit test Author: samelamin Author: samelamin Closes #17472 from samelamin/SPARK-1. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/258bff2c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/258bff2c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/258bff2c Branch: refs/heads/master Commit: 258bff2c3f54490ddca898e276029db9adf575d9 Parents: 0197262 Author: samelamin Authored: Thu Mar 30 16:08:26 2017 +0100 Committer: Sean Owen Committed: Thu Mar 30 16:08:26 2017 +0100 -- .../java/org/apache/spark/unsafe/Platform.java | 28 +++- 1 file changed, 16 insertions(+), 12 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/258bff2c/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java -- diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java index f13c24a..1321b83 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java @@ -46,18 +46,22 @@ public final class Platform { private static final boolean unaligned; static { boolean _unaligned; -// use reflection to access unaligned field -try { - Class bitsClass = -Class.forName("java.nio.Bits", false, ClassLoader.getSystemClassLoader()); - Method unalignedMethod = bitsClass.getDeclaredMethod("unaligned"); - unalignedMethod.setAccessible(true); - _unaligned = Boolean.TRUE.equals(unalignedMethod.invoke(null)); -} catch (Throwable t) { - // We at least know x86 and x64 support unaligned access. - String arch = System.getProperty("os.arch", ""); - //noinspection DynamicRegexReplaceableByCompiledPattern - _unaligned = arch.matches("^(i[3-6]86|x86(_64)?|x64|amd64|aarch64)$"); +String arch = System.getProperty("os.arch", ""); +if (arch.equals("ppc64le") || arch.equals("ppc64")) { + // Since java.nio.Bits.unaligned() doesn't return true on ppc (See JDK-8165231), but ppc64 and ppc64le support it + _unaligned = true; +} else { + try { +Class bitsClass = + Class.forName("java.nio.Bits", false, ClassLoader.getSystemClassLoader()); +Method unalignedMethod = bitsClass.getDeclaredMethod("unaligned"); +unalignedMethod.setAccessible(true); +_unaligned = Boolean.TRUE.equals(unalignedMethod.invoke(null)); + } catch (Throwable t) { +// We at least know x86 and x64 support unaligned access. +//noinspection DynamicRegexReplaceableByCompiledPattern +_unaligned = arch.matches("^(i[3-6]86|x86(_64)?|x64|amd64|aarch64)$"); + } } unaligned = _unaligned; } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [DOCS][MINOR] Fixed a few typos in the Structured Streaming documentation
Repository: spark Updated Branches: refs/heads/master e9d268f63 -> 669a11b61 [DOCS][MINOR] Fixed a few typos in the Structured Streaming documentation Fixed a few typos. There is one more I'm not sure of: ``` Append mode uses watermark to drop old aggregation state. But the output of a windowed aggregation is delayed the late threshold specified in `withWatermark()` as by the modes semantics, rows can be added to the Result Table only once after they are ``` Not sure how to change `is delayed the late threshold`. Author: Seigneurin, Alexis (CONT) Closes #17443 from aseigneurin/typos. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/669a11b6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/669a11b6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/669a11b6 Branch: refs/heads/master Commit: 669a11b61bc217a13217f1ef48d781329c45575e Parents: e9d268f Author: Seigneurin, Alexis (CONT) Authored: Thu Mar 30 16:12:17 2017 +0100 Committer: Sean Owen Committed: Thu Mar 30 16:12:17 2017 +0100 -- docs/structured-streaming-programming-guide.md | 18 +- 1 file changed, 9 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/669a11b6/docs/structured-streaming-programming-guide.md -- diff --git a/docs/structured-streaming-programming-guide.md b/docs/structured-streaming-programming-guide.md index ff07ad1..b5cf9f1 100644 --- a/docs/structured-streaming-programming-guide.md +++ b/docs/structured-streaming-programming-guide.md @@ -717,11 +717,11 @@ However, to run this query for days, it's necessary for the system to bound the intermediate in-memory state it accumulates. This means the system needs to know when an old aggregate can be dropped from the in-memory state because the application is not going to receive late data for that aggregate any more. To enable this, in Spark 2.1, we have introduced -**watermarking**, which let's the engine automatically track the current event time in the data and +**watermarking**, which lets the engine automatically track the current event time in the data and attempt to clean up old state accordingly. You can define the watermark of a query by -specifying the event time column and the threshold on how late the data is expected be in terms of +specifying the event time column and the threshold on how late the data is expected to be in terms of event time. For a specific window starting at time `T`, the engine will maintain state and allow late -data to be update the state until `(max event time seen by the engine - late threshold > T)`. +data to update the state until `(max event time seen by the engine - late threshold > T)`. In other words, late data within the threshold will be aggregated, but data later than the threshold will be dropped. Let's understand this with an example. We can easily define watermarking on the previous example using `withWatermark()` as shown below. @@ -792,7 +792,7 @@ This watermark lets the engine maintain intermediate state for additional 10 min data to be counted. For example, the data `(12:09, cat)` is out of order and late, and it falls in windows `12:05 - 12:15` and `12:10 - 12:20`. Since, it is still ahead of the watermark `12:04` in the trigger, the engine still maintains the intermediate counts as state and correctly updates the -counts of the related windows. However, when the watermark is updated to 12:11, the intermediate +counts of the related windows. 
However, when the watermark is updated to `12:11`, the intermediate state for window `(12:00 - 12:10)` is cleared, and all subsequent data (e.g. `(12:04, donkey)`) is considered "too late" and therefore ignored. Note that after every trigger, the updated counts (i.e. purple rows) are written to sink as the trigger output, as dictated by @@ -825,7 +825,7 @@ section for detailed explanation of the semantics of each output mode. same column as the timestamp column used in the aggregate. For example, `df.withWatermark("time", "1 min").groupBy("time2").count()` is invalid in Append output mode, as watermark is defined on a different column -as the aggregation column. +from the aggregation column. - `withWatermark` must be called before the aggregation for the watermark details to be used. For example, `df.groupBy("time").count().withWatermark("time", "1 min")` is invalid in Append @@ -909,7 +909,7 @@ track of all the data received in the stream. This is therefore fundamentally ha efficiently. ## Starting Streaming Queries -Once you have defined the final result DataFrame/Dataset, all that is left is for you start the streaming computation. To do that, you have
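For readers of the corrected passage, the API being described looks roughly like this; the sketch assumes a streaming Dataset `words` with `timestamp` and `word` columns:

```scala
import org.apache.spark.sql.functions.{col, window}

// State for a window is kept until the event time tracked by the watermark passes the
// 10-minute late threshold, after which the old aggregate can be dropped.
val windowedCounts = words
  .withWatermark("timestamp", "10 minutes")
  .groupBy(window(col("timestamp"), "10 minutes", "5 minutes"), col("word"))
  .count()
```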
spark git commit: [SPARK-20127][CORE] few warning have been fixed which Intellij IDEA reported Intellij IDEA
Repository: spark Updated Branches: refs/heads/master 669a11b61 -> 5e00a5de1 [SPARK-20127][CORE] few warning have been fixed which Intellij IDEA reported Intellij IDEA ## What changes were proposed in this pull request? Few changes related to Intellij IDEA inspection. ## How was this patch tested? Changes were tested by existing unit tests Author: Denis Bolshakov Closes #17458 from dbolshak/SPARK-20127. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5e00a5de Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5e00a5de Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5e00a5de Branch: refs/heads/master Commit: 5e00a5de14ae2d80471c6f38c30cc6fe63e05163 Parents: 669a11b Author: Denis Bolshakov Authored: Thu Mar 30 16:15:40 2017 +0100 Committer: Sean Owen Committed: Thu Mar 30 16:15:40 2017 +0100 -- .../java/org/apache/spark/memory/TaskMemoryManager.java | 6 +- .../java/org/apache/spark/status/api/v1/TaskSorting.java | 5 ++--- .../main/scala/org/apache/spark/io/CompressionCodec.scala | 3 +-- core/src/main/scala/org/apache/spark/ui/WebUI.scala | 2 +- .../org/apache/spark/ui/exec/ExecutorThreadDumpPage.scala | 2 +- .../scala/org/apache/spark/ui/exec/ExecutorsPage.scala| 3 +-- .../scala/org/apache/spark/ui/exec/ExecutorsTab.scala | 4 ++-- .../scala/org/apache/spark/ui/jobs/AllStagesPage.scala| 4 ++-- .../scala/org/apache/spark/ui/jobs/ExecutorTable.scala| 4 ++-- .../org/apache/spark/ui/jobs/JobProgressListener.scala| 4 ++-- .../main/scala/org/apache/spark/ui/jobs/StagePage.scala | 10 +- .../main/scala/org/apache/spark/ui/jobs/StageTable.scala | 2 +- .../scala/org/apache/spark/ui/storage/StoragePage.scala | 2 +- 13 files changed, 22 insertions(+), 29 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5e00a5de/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java -- diff --git a/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java b/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java index 39fb3b2..aa0b373 100644 --- a/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java +++ b/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java @@ -155,11 +155,7 @@ public class TaskMemoryManager { for (MemoryConsumer c: consumers) { if (c != consumer && c.getUsed() > 0 && c.getMode() == mode) { long key = c.getUsed(); -List list = sortedConsumers.get(key); -if (list == null) { - list = new ArrayList<>(1); - sortedConsumers.put(key, list); -} +List list = sortedConsumers.computeIfAbsent(key, k -> new ArrayList<>(1)); list.add(c); } } http://git-wip-us.apache.org/repos/asf/spark/blob/5e00a5de/core/src/main/java/org/apache/spark/status/api/v1/TaskSorting.java -- diff --git a/core/src/main/java/org/apache/spark/status/api/v1/TaskSorting.java b/core/src/main/java/org/apache/spark/status/api/v1/TaskSorting.java index 9307eb9..b38639e 100644 --- a/core/src/main/java/org/apache/spark/status/api/v1/TaskSorting.java +++ b/core/src/main/java/org/apache/spark/status/api/v1/TaskSorting.java @@ -19,6 +19,7 @@ package org.apache.spark.status.api.v1; import org.apache.spark.util.EnumUtil; +import java.util.Collections; import java.util.HashSet; import java.util.Set; @@ -30,9 +31,7 @@ public enum TaskSorting { private final Set alternateNames; TaskSorting(String... 
names) { alternateNames = new HashSet<>(); -for (String n: names) { - alternateNames.add(n); -} +Collections.addAll(alternateNames, names); } public static TaskSorting fromString(String str) { http://git-wip-us.apache.org/repos/asf/spark/blob/5e00a5de/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala -- diff --git a/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala b/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala index 2e991ce..c216fe4 100644 --- a/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala +++ b/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala @@ -71,8 +71,7 @@ private[spark] object CompressionCodec { val ctor = Utils.classForName(codecClass).getConstructor(classOf[SparkConf]) Some(ctor.newInstance(conf).asInstanceOf[CompressionCodec]) } catch { - case e: ClassNotFoundException => None - case e: IllegalArgumentException => None + case _: ClassNotFoundException | _: IllegalArgumentException => None }
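The Java-side changes above lean on `Map.computeIfAbsent` and `Collections.addAll`. For comparison, the closest Scala idiom to the first pattern is `mutable.Map.getOrElseUpdate`; the following stand-alone sketch is not Spark code:

```scala
import scala.collection.mutable

// Create the bucket only when a key is first seen, then append to it, which is the same
// shape as the computeIfAbsent call in TaskMemoryManager above.
val consumersByUsage = mutable.Map.empty[Long, mutable.ArrayBuffer[String]]

def register(used: Long, consumer: String): Unit =
  consumersByUsage.getOrElseUpdate(used, mutable.ArrayBuffer.empty[String]) += consumer

register(64L, "consumerA")
register(64L, "consumerB")
// consumersByUsage(64L) == ArrayBuffer("consumerA", "consumerB")
```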
spark git commit: [SPARK-20096][SPARK SUBMIT][MINOR] Expose the right queue name not null if set by --conf or configure file
Repository: spark Updated Branches: refs/heads/master 258bff2c3 -> e9d268f63 [SPARK-20096][SPARK SUBMIT][MINOR] Expose the right queue name not null if set by --conf or configure file ## What changes were proposed in this pull request? while submit apps with -v or --verboseï¼ we can print the right queue name, but if we set a queue name with `spark.yarn.queue` by --conf or in the spark-default.conf, we just got `null` for the queue in Parsed arguments. ``` bin/spark-shell -v --conf spark.yarn.queue=thequeue Using properties file: /home/hadoop/spark-2.1.0-bin-apache-hdp2.7.3/conf/spark-defaults.conf Adding default property: spark.yarn.queue=default Parsed arguments: master yarn deployMode client ... queue null verbose true Spark properties used, including those specified through --conf and those from the properties file /home/hadoop/spark-2.1.0-bin-apache-hdp2.7.3/conf/spark-defaults.conf: spark.yarn.queue -> thequeue ``` ## How was this patch tested? ut and local verify Author: Kent Yao Closes #17430 from yaooqinn/SPARK-20096. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e9d268f6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e9d268f6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e9d268f6 Branch: refs/heads/master Commit: e9d268f63e7308486739aa56ece02815bfb432d6 Parents: 258bff2 Author: Kent Yao Authored: Thu Mar 30 16:11:03 2017 +0100 Committer: Sean Owen Committed: Thu Mar 30 16:11:03 2017 +0100 -- .../org/apache/spark/deploy/SparkSubmitArguments.scala | 1 + .../scala/org/apache/spark/deploy/SparkSubmitSuite.scala | 11 +++ 2 files changed, 12 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e9d268f6/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala index 0614d80..0144fd1 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala @@ -190,6 +190,7 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S .orNull numExecutors = Option(numExecutors) .getOrElse(sparkProperties.get("spark.executor.instances").orNull) +queue = Option(queue).orElse(sparkProperties.get("spark.yarn.queue")).orNull keytab = Option(keytab).orElse(sparkProperties.get("spark.yarn.keytab")).orNull principal = Option(principal).orElse(sparkProperties.get("spark.yarn.principal")).orNull http://git-wip-us.apache.org/repos/asf/spark/blob/e9d268f6/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index a591b98..7c2ec01 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -148,6 +148,17 @@ class SparkSubmitSuite appArgs.childArgs should be (Seq("--master", "local", "some", "--weird", "args")) } + test("print the right queue name") { +val clArgs = Seq( + "--name", "myApp", + "--class", "Foo", + "--conf", "spark.yarn.queue=thequeue", + "userjar.jar") +val appArgs = new SparkSubmitArguments(clArgs) +appArgs.queue should be ("thequeue") +appArgs.toString should include ("thequeue") + } + test("specify deploy mode through configuration") { val clArgs 
= Seq( "--master", "yarn",
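The one-line fix follows the usual fallback chain: explicit argument first, then the property from the loaded Spark properties, otherwise null. The same pattern in isolation, with a made-up method name:

```scala
// Prefer an explicitly supplied value, fall back to the properties map, else null.
def resolveQueue(parsedQueue: String, sparkProperties: Map[String, String]): String =
  Option(parsedQueue).orElse(sparkProperties.get("spark.yarn.queue")).orNull

resolveQueue(null, Map("spark.yarn.queue" -> "thequeue"))        // "thequeue"
resolveQueue("cli-queue", Map("spark.yarn.queue" -> "thequeue")) // "cli-queue"
resolveQueue(null, Map.empty)                                    // null
```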
spark git commit: [SPARK-20177] Document about compression way has some little detail ch…
Repository: spark Updated Branches: refs/heads/master 567a50acf -> cf5963c96 [SPARK-20177] Document about compression way has some little detail ch⦠â¦anges. ## What changes were proposed in this pull request? Document compression way little detail changes. 1.spark.eventLog.compress add 'Compression will use spark.io.compression.codec.' 2.spark.broadcast.compress add 'Compression will use spark.io.compression.codec.' 3,spark.rdd.compress add 'Compression will use spark.io.compression.codec.' 4.spark.io.compression.codec add 'event log describe'. eg Through the documents, I don't know what is compression mode about 'event log'. ## How was this patch tested? manual tests Please review http://spark.apache.org/contributing.html before opening a pull request. Author: éå°é¾ 10207633 Closes #17498 from guoxiaolongzte/SPARK-20177. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cf5963c9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cf5963c9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cf5963c9 Branch: refs/heads/master Commit: cf5963c961e7eba37bdd58658ed4dfff66ce3c72 Parents: 567a50a Author: éå°é¾ 10207633 Authored: Sat Apr 1 11:48:58 2017 +0100 Committer: Sean Owen Committed: Sat Apr 1 11:48:58 2017 +0100 -- docs/configuration.md | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/cf5963c9/docs/configuration.md -- diff --git a/docs/configuration.md b/docs/configuration.md index a975392..2687f54 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -639,6 +639,7 @@ Apart from these, the following properties are also available, and may be useful false Whether to compress logged events, if spark.eventLog.enabled is true. +Compression will use spark.io.compression.codec. @@ -773,14 +774,15 @@ Apart from these, the following properties are also available, and may be useful true Whether to compress broadcast variables before sending them. Generally a good idea. +Compression will use spark.io.compression.codec. spark.io.compression.codec lz4 -The codec used to compress internal data such as RDD partitions, broadcast variables and -shuffle outputs. By default, Spark provides three codecs: lz4, lzf, +The codec used to compress internal data such as RDD partitions, event log, broadcast variables +and shuffle outputs. By default, Spark provides three codecs: lz4, lzf, and snappy. You can also use fully qualified class names to specify the codec, e.g. org.apache.spark.io.LZ4CompressionCodec, @@ -881,6 +883,7 @@ Apart from these, the following properties are also available, and may be useful StorageLevel.MEMORY_ONLY_SER in Java and Scala or StorageLevel.MEMORY_ONLY in Python). Can save substantial space at the cost of some extra CPU time. +Compression will use spark.io.compression.codec. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
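The options clarified above all defer to `spark.io.compression.codec`. A small configuration sketch tying them together (the values shown are just defaults or common choices):

```scala
import org.apache.spark.SparkConf

// Event logs, broadcast variables and serialized RDD partitions are all compressed with
// the codec chosen here when their respective *.compress flags are enabled.
val conf = new SparkConf()
  .set("spark.io.compression.codec", "lz4")   // also accepts lzf, snappy, or a class name
  .set("spark.eventLog.enabled", "true")
  .set("spark.eventLog.compress", "true")
  .set("spark.broadcast.compress", "true")
  .set("spark.rdd.compress", "true")
```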
spark git commit: [SPARK-20123][BUILD] SPARK_HOME variable might have spaces in it(e.g. $SPARK…
Repository: spark Updated Branches: refs/heads/master d40cbb861 -> 76de2d115 [SPARK-20123][BUILD] SPARK_HOME variable might have spaces in it(e.g. $SPARK⦠JIRA Issue: https://issues.apache.org/jira/browse/SPARK-20123 ## What changes were proposed in this pull request? If $SPARK_HOME or $FWDIR variable contains spaces, then use "./dev/make-distribution.sh --name custom-spark --tgz -Psparkr -Phadoop-2.7 -Phive -Phive-thriftserver -Pmesos -Pyarn" build spark will failed. ## How was this patch tested? manual tests Author: zuotingbing Closes #17452 from zuotingbing/spark-bulid. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/76de2d11 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/76de2d11 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/76de2d11 Branch: refs/heads/master Commit: 76de2d115364aa6a1fdaacdfae05f0c695c953b8 Parents: d40cbb8 Author: zuotingbing Authored: Sun Apr 2 15:31:13 2017 +0100 Committer: Sean Owen Committed: Sun Apr 2 15:31:13 2017 +0100 -- R/check-cran.sh | 20 ++-- R/create-docs.sh| 10 +- R/create-rd.sh | 8 R/install-dev.sh| 14 +++--- R/install-source-package.sh | 20 ++-- dev/make-distribution.sh| 32 6 files changed, 52 insertions(+), 52 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/76de2d11/R/check-cran.sh -- diff --git a/R/check-cran.sh b/R/check-cran.sh index a188b14..22cc9c6 100755 --- a/R/check-cran.sh +++ b/R/check-cran.sh @@ -20,18 +20,18 @@ set -o pipefail set -e -FWDIR="$(cd `dirname "${BASH_SOURCE[0]}"`; pwd)" -pushd $FWDIR > /dev/null +FWDIR="$(cd "`dirname "${BASH_SOURCE[0]}"`"; pwd)" +pushd "$FWDIR" > /dev/null -. $FWDIR/find-r.sh +. "$FWDIR/find-r.sh" # Install the package (this is required for code in vignettes to run when building it later) # Build the latest docs, but not vignettes, which is built with the package next -. $FWDIR/install-dev.sh +. "$FWDIR/install-dev.sh" # Build source package with vignettes SPARK_HOME="$(cd "${FWDIR}"/..; pwd)" -. "${SPARK_HOME}"/bin/load-spark-env.sh +. "${SPARK_HOME}/bin/load-spark-env.sh" if [ -f "${SPARK_HOME}/RELEASE" ]; then SPARK_JARS_DIR="${SPARK_HOME}/jars" else @@ -40,16 +40,16 @@ fi if [ -d "$SPARK_JARS_DIR" ]; then # Build a zip file containing the source package with vignettes - SPARK_HOME="${SPARK_HOME}" "$R_SCRIPT_PATH/"R CMD build $FWDIR/pkg + SPARK_HOME="${SPARK_HOME}" "$R_SCRIPT_PATH/R" CMD build "$FWDIR/pkg" find pkg/vignettes/. -not -name '.' -not -name '*.Rmd' -not -name '*.md' -not -name '*.pdf' -not -name '*.html' -delete else - echo "Error Spark JARs not found in $SPARK_HOME" + echo "Error Spark JARs not found in '$SPARK_HOME'" exit 1 fi # Run check as-cran. 
-VERSION=`grep Version $FWDIR/pkg/DESCRIPTION | awk '{print $NF}'` +VERSION=`grep Version "$FWDIR/pkg/DESCRIPTION" | awk '{print $NF}'` CRAN_CHECK_OPTIONS="--as-cran" @@ -67,10 +67,10 @@ echo "Running CRAN check with $CRAN_CHECK_OPTIONS options" if [ -n "$NO_TESTS" ] && [ -n "$NO_MANUAL" ] then - "$R_SCRIPT_PATH/"R CMD check $CRAN_CHECK_OPTIONS SparkR_"$VERSION".tar.gz + "$R_SCRIPT_PATH/R" CMD check $CRAN_CHECK_OPTIONS "SparkR_$VERSION.tar.gz" else # This will run tests and/or build vignettes, and require SPARK_HOME - SPARK_HOME="${SPARK_HOME}" "$R_SCRIPT_PATH/"R CMD check $CRAN_CHECK_OPTIONS SparkR_"$VERSION".tar.gz + SPARK_HOME="${SPARK_HOME}" "$R_SCRIPT_PATH/R" CMD check $CRAN_CHECK_OPTIONS "SparkR_$VERSION.tar.gz" fi popd > /dev/null http://git-wip-us.apache.org/repos/asf/spark/blob/76de2d11/R/create-docs.sh -- diff --git a/R/create-docs.sh b/R/create-docs.sh index 6bef7e7..310dbc5 100755 --- a/R/create-docs.sh +++ b/R/create-docs.sh @@ -33,15 +33,15 @@ export FWDIR="$(cd "`dirname "${BASH_SOURCE[0]}"`"; pwd)" export SPARK_HOME="$(cd "`dirname "${BASH_SOURCE[0]}"`"/..; pwd)" # Required for setting SPARK_SCALA_VERSION -. "${SPARK_HOME}"/bin/load-spark-env.sh +. "${SPARK_HOME}/bin/load-spark-env.sh" echo "Using Scala $SPARK_SCALA_VERSION" -pushd $FWDIR > /dev/null -. $FWDIR/find-r.sh +pushd "$FWDIR" > /dev/null +. "$FWDIR/find-r.sh" # Install the package (this will also generate the Rd files) -. $FWDIR/install-dev.sh +. "$FWDIR/install-dev.sh" # Now create HTML files @@ -49,7 +49,7 @@ pushd $FWDIR > /dev/null mkdir -p pkg/html pushd pkg/html -"$R_SCRIPT_PATH/"Rscript -e 'libDir <- "../../lib"; library(SparkR, lib.loc=libDir); library(knitr); knit_rd("SparkR", links = tools::findHTMLlinks(paste(libDir, "SparkR", sep="/")))' +"$R_S
spark git commit: [SPARK-19999][BACKPORT-2.1][CORE] Workaround JDK-8165231 to identify PPC64 architectures as supporting unaligned access
Repository: spark Updated Branches: refs/heads/branch-2.1 e3cec18e1 -> 968eace85 [SPARK-1][BACKPORT-2.1][CORE] Workaround JDK-8165231 to identify PPC64 architectures as supporting unaligned access ## What changes were proposed in this pull request? This PR is backport of #17472 to Spark 2.1 java.nio.Bits.unaligned() does not return true for the ppc64le arch. see [https://bugs.openjdk.java.net/browse/JDK-8165231](https://bugs.openjdk.java.net/browse/JDK-8165231) Check architecture in Platform.java ## How was this patch tested? unit test Author: Kazuaki Ishizaki Closes #17509 from kiszk/branch-2.1. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/968eace8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/968eace8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/968eace8 Branch: refs/heads/branch-2.1 Commit: 968eace85005d265cb8ff9d3f4aa2d20db58f8fe Parents: e3cec18 Author: Kazuaki Ishizaki Authored: Sun Apr 2 15:33:48 2017 +0100 Committer: Sean Owen Committed: Sun Apr 2 15:33:48 2017 +0100 -- .../java/org/apache/spark/unsafe/Platform.java | 28 +++- 1 file changed, 16 insertions(+), 12 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/968eace8/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java -- diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java index 671b8c7..ba35cf2 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java @@ -46,18 +46,22 @@ public final class Platform { private static final boolean unaligned; static { boolean _unaligned; -// use reflection to access unaligned field -try { - Class bitsClass = -Class.forName("java.nio.Bits", false, ClassLoader.getSystemClassLoader()); - Method unalignedMethod = bitsClass.getDeclaredMethod("unaligned"); - unalignedMethod.setAccessible(true); - _unaligned = Boolean.TRUE.equals(unalignedMethod.invoke(null)); -} catch (Throwable t) { - // We at least know x86 and x64 support unaligned access. - String arch = System.getProperty("os.arch", ""); - //noinspection DynamicRegexReplaceableByCompiledPattern - _unaligned = arch.matches("^(i[3-6]86|x86(_64)?|x64|amd64|aarch64)$"); +String arch = System.getProperty("os.arch", ""); +if (arch.equals("ppc64le") || arch.equals("ppc64")) { + // Since java.nio.Bits.unaligned() doesn't return true on ppc (See JDK-8165231), but ppc64 and ppc64le support it + _unaligned = true; +} else { + try { +Class bitsClass = + Class.forName("java.nio.Bits", false, ClassLoader.getSystemClassLoader()); +Method unalignedMethod = bitsClass.getDeclaredMethod("unaligned"); +unalignedMethod.setAccessible(true); +_unaligned = Boolean.TRUE.equals(unalignedMethod.invoke(null)); + } catch (Throwable t) { +// We at least know x86 and x64 support unaligned access. +//noinspection DynamicRegexReplaceableByCompiledPattern +_unaligned = arch.matches("^(i[3-6]86|x86(_64)?|x64|amd64|aarch64)$"); + } } unaligned = _unaligned; } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-20173][SQL][HIVE-THRIFTSERVER] Throw NullPointerException when HiveThriftServer2 is shutdown
Repository: spark Updated Branches: refs/heads/master 76de2d115 -> 657cb9541 [SPARK-20173][SQL][HIVE-THRIFTSERVER] Throw NullPointerException when HiveThriftServer2 is shutdown ## What changes were proposed in this pull request? If the shutdown hook is called before the variable `uiTab` is set, it will throw a NullPointerException. ## How was this patch tested? manual tests Author: zuotingbing Closes #17496 from zuotingbing/SPARK-HiveThriftServer2. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/657cb954 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/657cb954 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/657cb954 Branch: refs/heads/master Commit: 657cb9541db8508ce64d08cc3de14cd02adf16b5 Parents: 76de2d1 Author: zuotingbing Authored: Sun Apr 2 15:39:51 2017 +0100 Committer: Sean Owen Committed: Sun Apr 2 15:39:51 2017 +0100 -- .../org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/657cb954/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala -- diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala index 13c6f11..1455360 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala @@ -46,7 +46,7 @@ import org.apache.spark.util.{ShutdownHookManager, Utils} */ object HiveThriftServer2 extends Logging { var LOG = LogFactory.getLog(classOf[HiveServer2]) - var uiTab: Option[ThriftServerTab] = _ + var uiTab: Option[ThriftServerTab] = None var listener: HiveThriftServer2Listener = _ /**
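The difference between `= _` and `= None` is easy to reproduce outside Spark; here is a minimal sketch of the failure mode the patch removes:

```scala
// `= _` default-initializes a var field to null, so a shutdown hook that touches it
// before it is assigned throws NullPointerException; `= None` is usable immediately.
object UiTabSketch {
  var uiTabUnsafe: Option[String] = _      // null until someone assigns it
  var uiTabSafe: Option[String] = None     // safe to read at any time

  def main(args: Array[String]): Unit = {
    uiTabSafe.foreach(println)             // no-op, no exception
    uiTabUnsafe.foreach(println)           // throws NullPointerException
  }
}
```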
spark git commit: [SPARK-20166][SQL] Use XXX for ISO 8601 timezone instead of ZZ (FastDateFormat specific) in CSV/JSON timeformat options
Repository: spark Updated Branches: refs/heads/master 2a903a1ee -> cff11fd20 [SPARK-20166][SQL] Use XXX for ISO 8601 timezone instead of ZZ (FastDateFormat specific) in CSV/JSON timeformat options ## What changes were proposed in this pull request? This PR proposes to use `XXX` format instead of `ZZ`. `ZZ` seems a `FastDateFormat` specific. `ZZ` supports "ISO 8601 extended format time zones" but it seems `FastDateFormat` specific option. I misunderstood this is compatible format with `SimpleDateFormat` when this change is introduced. Please see [SimpleDateFormat documentation]( https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html#iso8601timezone) and [FastDateFormat documentation](https://commons.apache.org/proper/commons-lang/apidocs/org/apache/commons/lang3/time/FastDateFormat.html). It seems we better replace `ZZ` to `XXX` because they look using the same strategy - [FastDateParser.java#L930](https://github.com/apache/commons-lang/blob/8767cd4f1a6af07093c1e6c422dae8e574be7e5e/src/main/java/org/apache/commons/lang3/time/FastDateParser.java#L930), [FastDateParser.java#L932-L951 ](https://github.com/apache/commons-lang/blob/8767cd4f1a6af07093c1e6c422dae8e574be7e5e/src/main/java/org/apache/commons/lang3/time/FastDateParser.java#L932-L951) and [FastDateParser.java#L596-L601](https://github.com/apache/commons-lang/blob/8767cd4f1a6af07093c1e6c422dae8e574be7e5e/src/main/java/org/apache/commons/lang3/time/FastDateParser.java#L596-L601). I also checked the codes and manually debugged it for sure. It seems both cases use the same pattern `( Z|(?:[+-]\\d{2}(?::)\\d{2}))`. _Note that this should be rather a fix about documentation and not the behaviour change because `ZZ` seems invalid date format in `SimpleDateFormat` as documented in `DataFrameReader` and etc, and both `ZZ` and `XXX` look identically working with `FastDateFormat`_ Current documentation is as below: ``` * `timestampFormat` (default `-MM-dd'T'HH:mm:ss.SSSZZ`): sets the string that * indicates a timestamp format. Custom date formats follow the formats at * `java.text.SimpleDateFormat`. This applies to timestamp type. ``` ## How was this patch tested? Existing tests should cover this. Also, manually tested as below (BTW, I don't think these are worth being added as tests within Spark): **Parse** ```scala scala> new java.text.SimpleDateFormat("-MM-dd'T'HH:mm:ss.SSSXXX").parse("2017-03-21T00:00:00.000-11:00") res4: java.util.Date = Tue Mar 21 20:00:00 KST 2017 scala> new java.text.SimpleDateFormat("-MM-dd'T'HH:mm:ss.SSSXXX").parse("2017-03-21T00:00:00.000Z") res10: java.util.Date = Tue Mar 21 09:00:00 KST 2017 scala> new java.text.SimpleDateFormat("-MM-dd'T'HH:mm:ss.SSSZZ").parse("2017-03-21T00:00:00.000-11:00") java.text.ParseException: Unparseable date: "2017-03-21T00:00:00.000-11:00" at java.text.DateFormat.parse(DateFormat.java:366) ... 48 elided scala> new java.text.SimpleDateFormat("-MM-dd'T'HH:mm:ss.SSSZZ").parse("2017-03-21T00:00:00.000Z") java.text.ParseException: Unparseable date: "2017-03-21T00:00:00.000Z" at java.text.DateFormat.parse(DateFormat.java:366) ... 
48 elided ``` ```scala scala> org.apache.commons.lang3.time.FastDateFormat.getInstance("-MM-dd'T'HH:mm:ss.SSSXXX").parse("2017-03-21T00:00:00.000-11:00") res7: java.util.Date = Tue Mar 21 20:00:00 KST 2017 scala> org.apache.commons.lang3.time.FastDateFormat.getInstance("-MM-dd'T'HH:mm:ss.SSSXXX").parse("2017-03-21T00:00:00.000Z") res1: java.util.Date = Tue Mar 21 09:00:00 KST 2017 scala> org.apache.commons.lang3.time.FastDateFormat.getInstance("-MM-dd'T'HH:mm:ss.SSSZZ").parse("2017-03-21T00:00:00.000-11:00") res8: java.util.Date = Tue Mar 21 20:00:00 KST 2017 scala> org.apache.commons.lang3.time.FastDateFormat.getInstance("-MM-dd'T'HH:mm:ss.SSSZZ").parse("2017-03-21T00:00:00.000Z") res2: java.util.Date = Tue Mar 21 09:00:00 KST 2017 ``` **Format** ```scala scala> new java.text.SimpleDateFormat("-MM-dd'T'HH:mm:ss.SSSXXX").format(new java.text.SimpleDateFormat("-MM-dd'T'HH:mm:ss.SSSXXX").parse("2017-03-21T00:00:00.000-11:00")) res6: String = 2017-03-21T20:00:00.000+09:00 ``` ```scala scala> val fd = org.apache.commons.lang3.time.FastDateFormat.getInstance("-MM-dd'T'HH:mm:ss.SSSZZ") fd: org.apache.commons.lang3.time.FastDateFormat = FastDateFormat[-MM-dd'T'HH:mm:ss.SSSZZ,ko_KR,Asia/Seoul] scala> fd.format(fd.parse("2017-03-21T00:00:00.000-11:00")) res1: String = 2017-03-21T20:00:00.000+09:00 scala> val fd = org.apache.commons.lang3.time.FastDateFormat.getInstance("-MM-dd'T'HH:mm:ss.SSSXXX") fd: org.apache.commons.lang3.time.FastDateFormat = FastDateFormat[-MM-dd'T'HH:mm:ss.SSSXXX,ko_KR,Asia/Seoul] scala> fd.format(fd.parse("2017-03-21T00:00:00.000-11:00")) res2: String = 2017-03-21T20:00:00.000+09:00 ``` Author: hyukjinkwon Closes #17489 from HyukjinKwon/SPARK-20166. Project: http://git-wip-us.apache.org/repos/asf/spark/repo C
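On the user-facing side, the documented pattern is what `timestampFormat` expects when reading JSON or CSV. A sketch with a made-up path and schema, assuming a `SparkSession` named `spark`:

```scala
import org.apache.spark.sql.types.{StringType, StructField, StructType, TimestampType}

// With XXX in the pattern, ISO 8601 offsets such as -11:00 or Z parse as expected.
val schema = StructType(Seq(
  StructField("ts", TimestampType),
  StructField("name", StringType)))

val events = spark.read
  .schema(schema)
  .option("timestampFormat", "yyyy-MM-dd'T'HH:mm:ss.SSSXXX")
  .json("/tmp/events.json")
```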
spark git commit: [MINOR][DOCS] Replace non-breaking space to normal spaces that breaks rendering markdown
Repository: spark Updated Branches: refs/heads/master cff11fd20 -> 364b0db75 [MINOR][DOCS] Replace non-breaking space to normal spaces that breaks rendering markdown

# What changes were proposed in this pull request?

It seems several non-breaking spaces were inserted into several `.md` files, and they appear to break Markdown rendering. They look the same but are different characters. For example, this can be checked via `python` as below:

```python
>>> " "
'\xc2\xa0'
>>> " "
' '
```

_Note that it seems this PR description automatically replaces non-breaking spaces with normal spaces. Please open `vi` and copy and paste it into `python` to verify this (do not copy the characters here)._

I checked the output below in Safari and Chrome on Mac OS, and Internet Explorer on Windows 10.

**Before**

![2017-04-03 12 37 17](https://cloud.githubusercontent.com/assets/6477701/24594655/50aaba02-186a-11e7-80bb-d34b17a3398a.png)
![2017-04-03 12 36 57](https://cloud.githubusercontent.com/assets/6477701/24594654/50a855e6-186a-11e7-94e2-661e56544b0f.png)

**After**

![2017-04-03 12 36 46](https://cloud.githubusercontent.com/assets/6477701/24594657/53c2545c-186a-11e7-9a73-00529afbfd75.png)
![2017-04-03 12 36 31](https://cloud.githubusercontent.com/assets/6477701/24594658/53c286c0-186a-11e7-99c9-e66b1f510fe7.png)

## How was this patch tested?

Manual checking. These instances were found via

```
grep --include=*.scala --include=*.python --include=*.java --include=*.r --include=*.R --include=*.md --include=*.r -r -I " " .
```

on Mac OS. It seems there are several more instances, as below:

```
./docs/sql-programming-guide.md:│   ├── ...
./docs/sql-programming-guide.md:│   │
./docs/sql-programming-guide.md:│   ├── country=US
./docs/sql-programming-guide.md:│   │   └── data.parquet
./docs/sql-programming-guide.md:│   ├── country=CN
./docs/sql-programming-guide.md:│   │   └── data.parquet
./docs/sql-programming-guide.md:│   └── ...
./docs/sql-programming-guide.md:    ├── ...
./docs/sql-programming-guide.md:    │
./docs/sql-programming-guide.md:    ├── country=US
./docs/sql-programming-guide.md:    │   └── data.parquet
./docs/sql-programming-guide.md:    ├── country=CN
./docs/sql-programming-guide.md:    │   └── data.parquet
./docs/sql-programming-guide.md:    └── ...
./sql/core/src/test/README.md:│   ├── *.avdl         # Testing Avro IDL(s)
./sql/core/src/test/README.md:│   └── *.avpr         # !! NO TOUCH !! Protocol files generated from Avro IDL(s)
./sql/core/src/test/README.md:│   ├── gen-avro.sh    # Script used to generate Java code for Avro
./sql/core/src/test/README.md:│   └── gen-thrift.sh  # Script used to generate Java code for Thrift
```

These seem to have been generated via the `tree` command, which inserts non-breaking spaces. They do not appear to cause any rendering problem within code blocks, and I did not fix them, to reduce the overhead of manually replacing them whenever the output is regenerated via `tree` in the future.

Author: hyukjinkwon

Closes #17517 from HyukjinKwon/non-breaking-space.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/364b0db7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/364b0db7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/364b0db7 Branch: refs/heads/master Commit: 364b0db75308ddd346b4ab1e032680e8eb4c1753 Parents: cff11fd Author: hyukjinkwon Authored: Mon Apr 3 10:09:11 2017 +0100 Committer: Sean Owen Committed: Mon Apr 3 10:09:11 2017 +0100 -- README.md | 2 +- docs/building-spark.md | 2 +- docs/monitoring.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/364b0db7/README.md -- diff --git a/README.md b/README.md index d0eca1d..1e521a7 100644 --- a/README.md +++ b/README.md @@ -97,7 +97,7 @@ building for particular Hive and Hive Thriftserver distributions. Please refer to the [Configuration Guide](http://spark.apache.org/docs/latest/configuration.html) in the online documentation for an overview on how to configure Spark. -## Contributing +## Contributing Please review the [Contribution to Spark guide](http://spark.apache.org/contributing.html) for information on how to get started contributing to the project. http://git-wip-us.apache.org/repos/asf/spark/blob/364b0db7/docs/building-spark.md -- diff --git a/docs/building-
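For anyone hunting these characters down programmatically, here is a small, self-contained Scala sketch (not part of the patch; the file path is a placeholder) that counts and normalizes U+00A0 in a source file:

```scala
import scala.io.Source

object NbspCheck {
  def main(args: Array[String]): Unit = {
    val path = args.headOption.getOrElse("README.md") // hypothetical default path
    val text = Source.fromFile(path, "UTF-8").mkString
    val count = text.count(_ == '\u00A0')             // U+00A0 is the non-breaking space
    println(s"Found $count non-breaking space(s) in $path")
    // The commit replaces them by hand; programmatically it is just:
    val cleaned = text.replace('\u00A0', ' ')
    // write `cleaned` back out if normalization is desired
  }
}
```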
spark git commit: [MINOR][DOCS] Replace non-breaking space to normal spaces that breaks rendering markdown
Repository: spark Updated Branches: refs/heads/branch-2.1 ca144106b -> 77700ea38 [MINOR][DOCS] Replace non-breaking space to normal spaces that breaks rendering markdown

# What changes were proposed in this pull request?

It seems several non-breaking spaces were inserted into several `.md` files, and they appear to break Markdown rendering. They look the same but are different characters. For example, this can be checked via `python` as below:

```python
>>> " "
'\xc2\xa0'
>>> " "
' '
```

_Note that it seems this PR description automatically replaces non-breaking spaces with normal spaces. Please open `vi` and copy and paste it into `python` to verify this (do not copy the characters here)._

I checked the output below in Safari and Chrome on Mac OS, and Internet Explorer on Windows 10.

**Before**

![2017-04-03 12 37 17](https://cloud.githubusercontent.com/assets/6477701/24594655/50aaba02-186a-11e7-80bb-d34b17a3398a.png)
![2017-04-03 12 36 57](https://cloud.githubusercontent.com/assets/6477701/24594654/50a855e6-186a-11e7-94e2-661e56544b0f.png)

**After**

![2017-04-03 12 36 46](https://cloud.githubusercontent.com/assets/6477701/24594657/53c2545c-186a-11e7-9a73-00529afbfd75.png)
![2017-04-03 12 36 31](https://cloud.githubusercontent.com/assets/6477701/24594658/53c286c0-186a-11e7-99c9-e66b1f510fe7.png)

## How was this patch tested?

Manual checking. These instances were found via

```
grep --include=*.scala --include=*.python --include=*.java --include=*.r --include=*.R --include=*.md --include=*.r -r -I " " .
```

on Mac OS. It seems there are several more instances, as below:

```
./docs/sql-programming-guide.md:│   ├── ...
./docs/sql-programming-guide.md:│   │
./docs/sql-programming-guide.md:│   ├── country=US
./docs/sql-programming-guide.md:│   │   └── data.parquet
./docs/sql-programming-guide.md:│   ├── country=CN
./docs/sql-programming-guide.md:│   │   └── data.parquet
./docs/sql-programming-guide.md:│   └── ...
./docs/sql-programming-guide.md:    ├── ...
./docs/sql-programming-guide.md:    │
./docs/sql-programming-guide.md:    ├── country=US
./docs/sql-programming-guide.md:    │   └── data.parquet
./docs/sql-programming-guide.md:    ├── country=CN
./docs/sql-programming-guide.md:    │   └── data.parquet
./docs/sql-programming-guide.md:    └── ...
./sql/core/src/test/README.md:│   ├── *.avdl         # Testing Avro IDL(s)
./sql/core/src/test/README.md:│   └── *.avpr         # !! NO TOUCH !! Protocol files generated from Avro IDL(s)
./sql/core/src/test/README.md:│   ├── gen-avro.sh    # Script used to generate Java code for Avro
./sql/core/src/test/README.md:│   └── gen-thrift.sh  # Script used to generate Java code for Thrift
```

These seem to have been generated via the `tree` command, which inserts non-breaking spaces. They do not appear to cause any rendering problem within code blocks, and I did not fix them, to reduce the overhead of manually replacing them whenever the output is regenerated via `tree` in the future.

Author: hyukjinkwon

Closes #17517 from HyukjinKwon/non-breaking-space.
(cherry picked from commit 364b0db75308ddd346b4ab1e032680e8eb4c1753) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/77700ea3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/77700ea3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/77700ea3 Branch: refs/heads/branch-2.1 Commit: 77700ea38540b8326c37623abeebabf3d2497418 Parents: ca14410 Author: hyukjinkwon Authored: Mon Apr 3 10:09:11 2017 +0100 Committer: Sean Owen Committed: Mon Apr 3 10:09:21 2017 +0100 -- README.md | 2 +- docs/building-spark.md | 2 +- docs/monitoring.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/77700ea3/README.md -- diff --git a/README.md b/README.md index f598323..d861e9f 100644 --- a/README.md +++ b/README.md @@ -98,7 +98,7 @@ building for particular Hive and Hive Thriftserver distributions. Please refer to the [Configuration Guide](http://spark.apache.org/docs/latest/configuration.html) in the online documentation for an overview on how to configure Spark. -## Contributing +## Contributing Please review the [Contribution to Spark guide](http://spark.apache.org/contributing.html) for information on how to get started contributing to the project. http://git-wip-us.apache.org/repos/asf/spark/blob/77700ea3/docs/building-spa
spark git commit: [SPARK-9002][CORE] KryoSerializer initialization does not include 'Array[Int]'
Repository: spark Updated Branches: refs/heads/master 364b0db75 -> fb5869f2c [SPARK-9002][CORE] KryoSerializer initialization does not include 'Array[Int]' [SPARK-9002][CORE] KryoSerializer initialization does not include 'Array[Int]' ## What changes were proposed in this pull request? Array[Int] has been registered in KryoSerializer. The following file has been changed core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala ## How was this patch tested? First, the issue was reproduced by new unit test. Then, the issue was fixed to pass the failed test. Author: Denis Bolshakov Closes #17482 from dbolshak/SPARK-9002. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fb5869f2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fb5869f2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fb5869f2 Branch: refs/heads/master Commit: fb5869f2cf94217b3e254e2d0820507dc83a25cc Parents: 364b0db Author: Denis Bolshakov Authored: Mon Apr 3 10:16:07 2017 +0100 Committer: Sean Owen Committed: Mon Apr 3 10:16:07 2017 +0100 -- .../org/apache/spark/serializer/KryoSerializer.scala | 7 +++ .../org/apache/spark/serializer/KryoSerializerSuite.scala | 10 ++ 2 files changed, 17 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fb5869f2/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala -- diff --git a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala index 0381563..6fc66e2 100644 --- a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala +++ b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala @@ -384,9 +384,16 @@ private[serializer] object KryoSerializer { classOf[HighlyCompressedMapStatus], classOf[CompactBuffer[_]], classOf[BlockManagerId], +classOf[Array[Boolean]], classOf[Array[Byte]], classOf[Array[Short]], +classOf[Array[Int]], classOf[Array[Long]], +classOf[Array[Float]], +classOf[Array[Double]], +classOf[Array[Char]], +classOf[Array[String]], +classOf[Array[Array[String]]], classOf[BoundedPriorityQueue[_]], classOf[SparkConf] ) http://git-wip-us.apache.org/repos/asf/spark/blob/fb5869f2/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala index a30653b..7c3922e 100644 --- a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala +++ b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala @@ -76,6 +76,9 @@ class KryoSerializerSuite extends SparkFunSuite with SharedSparkContext { } test("basic types") { +val conf = new SparkConf(false) +conf.set("spark.kryo.registrationRequired", "true") + val ser = new KryoSerializer(conf).newInstance() def check[T: ClassTag](t: T) { assert(ser.deserialize[T](ser.serialize(t)) === t) @@ -106,6 +109,9 @@ class KryoSerializerSuite extends SparkFunSuite with SharedSparkContext { } test("pairs") { +val conf = new SparkConf(false) +conf.set("spark.kryo.registrationRequired", "true") + val ser = new KryoSerializer(conf).newInstance() def check[T: ClassTag](t: T) { assert(ser.deserialize[T](ser.serialize(t)) === t) @@ -130,12 +136,16 @@ class KryoSerializerSuite extends SparkFunSuite with SharedSparkContext { } test("Scala data structures") { +val conf = new SparkConf(false) 
+conf.set("spark.kryo.registrationRequired", "true") + val ser = new KryoSerializer(conf).newInstance() def check[T: ClassTag](t: T) { assert(ser.deserialize[T](ser.serialize(t)) === t) } check(List[Int]()) check(List[Int](1, 2, 3)) +check(Seq[Int](1, 2, 3)) check(List[String]()) check(List[String]("x", "y", "z")) check(None) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
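As an illustration of what the added registrations enable, here is a minimal sketch modeled on the new test (not an additional change from the patch): with registration required, an `Array[Int]` now round-trips because the class is registered by default.

```scala
import org.apache.spark.SparkConf
import org.apache.spark.serializer.KryoSerializer

// Force Kryo to reject any class that is not explicitly registered.
val conf = new SparkConf(false).set("spark.kryo.registrationRequired", "true")
val ser = new KryoSerializer(conf).newInstance()

// Round-trip an Array[Int]; this only works with registrationRequired=true
// because Array[Int] is now part of KryoSerializer's default registrations.
val bytes = ser.serialize(Array(1, 2, 3))
val back = ser.deserialize[Array[Int]](bytes)
assert(back.sameElements(Array(1, 2, 3)))
```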
spark git commit: [SPARK-20190][APP-ID] applications//jobs' in rest api,status should be [running|s…
Repository: spark Updated Branches: refs/heads/master b34f7665d -> c95fbea68 [SPARK-20190][APP-ID] applications//jobs' in rest api,status should be [running|s… …ucceeded|failed|unknown] ## What changes were proposed in this pull request? The status for '/applications/[app-id]/jobs' in the REST API should be '[running|succeeded|failed|unknown]'. Currently the status is '[complete|succeeded|failed]', but for '/applications/[app-id]/jobs?status=complete' the server returns 'HTTP ERROR 404'. Added '?status=running' and '?status=unknown'. Code: public enum JobExecutionStatus { RUNNING, SUCCEEDED, FAILED, UNKNOWN; ## How was this patch tested? Manual tests. Please review http://spark.apache.org/contributing.html before opening a pull request. Author: guoxiaolongzte Closes #17507 from guoxiaolongzte/SPARK-20190. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c95fbea6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c95fbea6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c95fbea6 Branch: refs/heads/master Commit: c95fbea68e9dfb2c96a1d13dde17d80a37066ae6 Parents: b34f766 Author: guoxiaolongzte Authored: Tue Apr 4 09:56:17 2017 +0100 Committer: Sean Owen Committed: Tue Apr 4 09:56:17 2017 +0100 -- docs/monitoring.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c95fbea6/docs/monitoring.md -- diff --git a/docs/monitoring.md b/docs/monitoring.md index 6cbc666..4d0617d 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -289,7 +289,7 @@ can be identified by their `[attempt-id]`. In the API listed below, when running /applications/[app-id]/jobs A list of all jobs for a given application. - ?status=[complete|succeeded|failed] list only jobs in the specific state. + ?status=[running|succeeded|failed|unknown] list only jobs in the specific state. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
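A quick way to exercise the documented status values is to query the REST endpoint directly. A minimal Scala sketch, assuming a live UI at localhost:4040 and a hypothetical application id:

```scala
import scala.io.Source

// Hypothetical application id; in practice take it from /api/v1/applications.
val appId = "app-20170404000000-0001"

// Filter jobs by the documented states: running | succeeded | failed | unknown.
val url = s"http://localhost:4040/api/v1/applications/$appId/jobs?status=running"
println(Source.fromURL(url).mkString)
```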
spark git commit: [SPARK-20190][APP-ID] applications//jobs' in rest api,status should be [running|s…
Repository: spark Updated Branches: refs/heads/branch-2.1 77700ea38 -> f9546dacb [SPARK-20190][APP-ID] applications//jobs' in rest api,status should be [running|s… …ucceeded|failed|unknown] ## What changes were proposed in this pull request? The status for '/applications/[app-id]/jobs' in the REST API should be '[running|succeeded|failed|unknown]'. Currently the status is '[complete|succeeded|failed]', but for '/applications/[app-id]/jobs?status=complete' the server returns 'HTTP ERROR 404'. Added '?status=running' and '?status=unknown'. Code: public enum JobExecutionStatus { RUNNING, SUCCEEDED, FAILED, UNKNOWN; ## How was this patch tested? Manual tests. Please review http://spark.apache.org/contributing.html before opening a pull request. Author: guoxiaolongzte Closes #17507 from guoxiaolongzte/SPARK-20190. (cherry picked from commit c95fbea68e9dfb2c96a1d13dde17d80a37066ae6) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f9546dac Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f9546dac Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f9546dac Branch: refs/heads/branch-2.1 Commit: f9546dacb6c7d25b93d952aa421a80acc6532c11 Parents: 77700ea Author: guoxiaolongzte Authored: Tue Apr 4 09:56:17 2017 +0100 Committer: Sean Owen Committed: Tue Apr 4 09:56:26 2017 +0100 -- docs/monitoring.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f9546dac/docs/monitoring.md -- diff --git a/docs/monitoring.md b/docs/monitoring.md index 5c8539d..be59350 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -278,7 +278,7 @@ can be identified by their `[attempt-id]`. In the API listed below, when running /applications/[app-id]/jobs A list of all jobs for a given application. - ?status=[complete|succeeded|failed] list only jobs in the specific state. + ?status=[running|succeeded|failed|unknown] list only jobs in the specific state. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-20042][WEB UI] Fix log page buttons for reverse proxy mode
Repository: spark Updated Branches: refs/heads/master dad499f32 -> 6f09dc70d [SPARK-20042][WEB UI] Fix log page buttons for reverse proxy mode with spark.ui.reverseProxy=true, full path URLs like /log will point to the master web endpoint which is serving the worker UI as reverse proxy. To access a REST endpoint in the worker in reverse proxy mode , the leading /proxy/"target"/ part of the base URI must be retained. Added logic to log-view.js to handle this, similar to executorspage.js Patch was tested manually Author: Oliver Köth Closes #17370 from okoethibm/master. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6f09dc70 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6f09dc70 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6f09dc70 Branch: refs/heads/master Commit: 6f09dc70d9808cae004ceda9ad615aa9be50f43d Parents: dad499f Author: Oliver Köth Authored: Wed Apr 5 08:09:42 2017 +0100 Committer: Sean Owen Committed: Wed Apr 5 08:09:42 2017 +0100 -- .../org/apache/spark/ui/static/log-view.js | 19 --- 1 file changed, 16 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6f09dc70/core/src/main/resources/org/apache/spark/ui/static/log-view.js -- diff --git a/core/src/main/resources/org/apache/spark/ui/static/log-view.js b/core/src/main/resources/org/apache/spark/ui/static/log-view.js index 1782b4f..b5c43e5 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/log-view.js +++ b/core/src/main/resources/org/apache/spark/ui/static/log-view.js @@ -51,13 +51,26 @@ function noNewAlert() { window.setTimeout(function () {alert.css("display", "none");}, 4000); } + +function getRESTEndPoint() { + // If the worker is served from the master through a proxy (see doc on spark.ui.reverseProxy), + // we need to retain the leading ../proxy// part of the URL when making REST requests. + // Similar logic is contained in executorspage.js function createRESTEndPoint. + var words = document.baseURI.split('/'); + var ind = words.indexOf("proxy"); + if (ind > 0) { + return words.slice(0, ind + 2).join('/') + "/log"; + } + return "/log" +} + function loadMore() { var offset = Math.max(startByte - byteLength, 0); var moreByteLength = Math.min(byteLength, startByte); $.ajax({ type: "GET", -url: "/log" + baseParams + "&offset=" + offset + "&byteLength=" + moreByteLength, +url: getRESTEndPoint() + baseParams + "&offset=" + offset + "&byteLength=" + moreByteLength, success: function (data) { var oldHeight = $(".log-content")[0].scrollHeight; var newlineIndex = data.indexOf('\n'); @@ -83,14 +96,14 @@ function loadMore() { function loadNew() { $.ajax({ type: "GET", -url: "/log" + baseParams + "&byteLength=0", +url: getRESTEndPoint() + baseParams + "&byteLength=0", success: function (data) { var dataInfo = data.substring(0, data.indexOf('\n')).match(/\d+/g); var newDataLen = dataInfo[2] - totalLogLength; if (newDataLen != 0) { $.ajax({ type: "GET", - url: "/log" + baseParams + "&byteLength=" + newDataLen, + url: getRESTEndPoint() + baseParams + "&byteLength=" + newDataLen, success: function (data) { var newlineIndex = data.indexOf('\n'); var dataInfo = data.substring(0, newlineIndex).match(/\d+/g); - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-20042][WEB UI] Fix log page buttons for reverse proxy mode
Repository: spark Updated Branches: refs/heads/branch-2.1 00c124884 -> efc72dcc3 [SPARK-20042][WEB UI] Fix log page buttons for reverse proxy mode with spark.ui.reverseProxy=true, full path URLs like /log will point to the master web endpoint which is serving the worker UI as reverse proxy. To access a REST endpoint in the worker in reverse proxy mode , the leading /proxy/"target"/ part of the base URI must be retained. Added logic to log-view.js to handle this, similar to executorspage.js Patch was tested manually Author: Oliver Köth Closes #17370 from okoethibm/master. (cherry picked from commit 6f09dc70d9808cae004ceda9ad615aa9be50f43d) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/efc72dcc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/efc72dcc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/efc72dcc Branch: refs/heads/branch-2.1 Commit: efc72dcc3f964ea9931fb47a454db253556d0f8a Parents: 00c1248 Author: Oliver Köth Authored: Wed Apr 5 08:09:42 2017 +0100 Committer: Sean Owen Committed: Wed Apr 5 08:09:52 2017 +0100 -- .../org/apache/spark/ui/static/log-view.js | 19 --- 1 file changed, 16 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/efc72dcc/core/src/main/resources/org/apache/spark/ui/static/log-view.js -- diff --git a/core/src/main/resources/org/apache/spark/ui/static/log-view.js b/core/src/main/resources/org/apache/spark/ui/static/log-view.js index 1782b4f..b5c43e5 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/log-view.js +++ b/core/src/main/resources/org/apache/spark/ui/static/log-view.js @@ -51,13 +51,26 @@ function noNewAlert() { window.setTimeout(function () {alert.css("display", "none");}, 4000); } + +function getRESTEndPoint() { + // If the worker is served from the master through a proxy (see doc on spark.ui.reverseProxy), + // we need to retain the leading ../proxy// part of the URL when making REST requests. + // Similar logic is contained in executorspage.js function createRESTEndPoint. + var words = document.baseURI.split('/'); + var ind = words.indexOf("proxy"); + if (ind > 0) { + return words.slice(0, ind + 2).join('/') + "/log"; + } + return "/log" +} + function loadMore() { var offset = Math.max(startByte - byteLength, 0); var moreByteLength = Math.min(byteLength, startByte); $.ajax({ type: "GET", -url: "/log" + baseParams + "&offset=" + offset + "&byteLength=" + moreByteLength, +url: getRESTEndPoint() + baseParams + "&offset=" + offset + "&byteLength=" + moreByteLength, success: function (data) { var oldHeight = $(".log-content")[0].scrollHeight; var newlineIndex = data.indexOf('\n'); @@ -83,14 +96,14 @@ function loadMore() { function loadNew() { $.ajax({ type: "GET", -url: "/log" + baseParams + "&byteLength=0", +url: getRESTEndPoint() + baseParams + "&byteLength=0", success: function (data) { var dataInfo = data.substring(0, data.indexOf('\n')).match(/\d+/g); var newDataLen = dataInfo[2] - totalLogLength; if (newDataLen != 0) { $.ajax({ type: "GET", - url: "/log" + baseParams + "&byteLength=" + newDataLen, + url: getRESTEndPoint() + baseParams + "&byteLength=" + newDataLen, success: function (data) { var newlineIndex = data.indexOf('\n'); var dataInfo = data.substring(0, newlineIndex).match(/\d+/g); - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-19807][WEB UI] Add reason for cancellation when a stage is killed using web UI
Repository: spark Updated Branches: refs/heads/master 6f09dc70d -> 71c3c4815 [SPARK-19807][WEB UI] Add reason for cancellation when a stage is killed using web UI ## What changes were proposed in this pull request? When a user kills a stage using web UI (in Stages page), StagesTab.handleKillRequest requests SparkContext to cancel the stage without giving a reason. SparkContext has cancelStage(stageId: Int, reason: String) that Spark could use to pass the information for monitoring/debugging purposes. ## How was this patch tested? manual tests Please review http://spark.apache.org/contributing.html before opening a pull request. Author: shaolinliu Author: lvdongr Closes #17258 from shaolinliu/SPARK-19807. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/71c3c481 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/71c3c481 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/71c3c481 Branch: refs/heads/master Commit: 71c3c48159fe7eb4a46fc2a1b78b72088ccfa824 Parents: 6f09dc7 Author: shaolinliu Authored: Wed Apr 5 13:47:44 2017 +0100 Committer: Sean Owen Committed: Wed Apr 5 13:47:44 2017 +0100 -- core/src/main/scala/org/apache/spark/ui/jobs/StagesTab.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/71c3c481/core/src/main/scala/org/apache/spark/ui/jobs/StagesTab.scala -- diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagesTab.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagesTab.scala index c1f2511..181465b 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StagesTab.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagesTab.scala @@ -42,7 +42,7 @@ private[ui] class StagesTab(parent: SparkUI) extends SparkUITab(parent, "stages" val stageId = Option(request.getParameter("id")).map(_.toInt) stageId.foreach { id => if (progressListener.activeStages.contains(id)) { - sc.foreach(_.cancelStage(id)) + sc.foreach(_.cancelStage(id, "killed via the Web UI")) // Do a quick pause here to give Spark time to kill the stage so it shows up as // killed after the refresh. Note that this will block the serving thread so the // time should be limited in duration. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
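For reference, the two-argument overload used above is also callable from application code. A minimal sketch, assuming an active `SparkContext` named `sc` and a hypothetical stage id:

```scala
// Cancel a specific stage and attach a human-readable reason, which is then
// reported wherever the cancellation is surfaced (logs, UI, listeners).
val stageId = 3 // hypothetical stage id taken from the Stages page
sc.cancelStage(stageId, "killed via the Web UI")
```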
spark git commit: [SPARK-11242][SQL] In conf/spark-env.sh.template SPARK_DRIVER_MEMORY is documented incorrectly
Repository: spark Updated Branches: refs/heads/master d4950e6be -> 188ea348f [SPARK-11242][SQL] In conf/spark-env.sh.template SPARK_DRIVER_MEMORY is documented incorrectly Minor fix on the comment Author: guoxi Closes #9201 from xguo27/SPARK-11242. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/188ea348 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/188ea348 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/188ea348 Branch: refs/heads/master Commit: 188ea348fdcf877d86f3c433cd15f6468fe3b42a Parents: d4950e6 Author: guoxi Authored: Thu Oct 22 13:56:18 2015 -0700 Committer: Sean Owen Committed: Thu Oct 22 13:56:18 2015 -0700 -- conf/spark-env.sh.template | 8 1 file changed, 4 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/188ea348/conf/spark-env.sh.template -- diff --git a/conf/spark-env.sh.template b/conf/spark-env.sh.template index 990ded4..771251f 100755 --- a/conf/spark-env.sh.template +++ b/conf/spark-env.sh.template @@ -36,10 +36,10 @@ # Options read in YARN client mode # - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files -# - SPARK_EXECUTOR_INSTANCES, Number of workers to start (Default: 2) -# - SPARK_EXECUTOR_CORES, Number of cores for the workers (Default: 1). -# - SPARK_EXECUTOR_MEMORY, Memory per Worker (e.g. 1000M, 2G) (Default: 1G) -# - SPARK_DRIVER_MEMORY, Memory for Master (e.g. 1000M, 2G) (Default: 1G) +# - SPARK_EXECUTOR_INSTANCES, Number of executors to start (Default: 2) +# - SPARK_EXECUTOR_CORES, Number of cores for the executors (Default: 1). +# - SPARK_EXECUTOR_MEMORY, Memory per Executor (e.g. 1000M, 2G) (Default: 1G) +# - SPARK_DRIVER_MEMORY, Memory for Driver (e.g. 1000M, 2G) (Default: 1G) # - SPARK_YARN_APP_NAME, The name of your application (Default: Spark) # - SPARK_YARN_QUEUE, The hadoop queue to use for allocation requests (Default: âdefaultâ) # - SPARK_YARN_DIST_FILES, Comma separated list of files to be distributed with the job. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-11245] update twitter4j to 4.0.4 version
Repository: spark Updated Branches: refs/heads/master ffed00493 -> e5bc8c275 [SPARK-11245] update twitter4j to 4.0.4 version update twitter4j to 4.0.4 version https://issues.apache.org/jira/browse/SPARK-11245 Author: dima Closes #9221 from pronix/twitter4j_update. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e5bc8c27 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e5bc8c27 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e5bc8c27 Branch: refs/heads/master Commit: e5bc8c27577f96c1ae5dc8cf9bf41cbe2877ffe3 Parents: ffed004 Author: dima Authored: Sat Oct 24 18:16:45 2015 +0100 Committer: Sean Owen Committed: Sat Oct 24 18:16:45 2015 +0100 -- external/twitter/pom.xml | 2 +- .../org/apache/spark/streaming/twitter/TwitterInputDStream.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e5bc8c27/external/twitter/pom.xml -- diff --git a/external/twitter/pom.xml b/external/twitter/pom.xml index 4c22ec8..087270d 100644 --- a/external/twitter/pom.xml +++ b/external/twitter/pom.xml @@ -51,7 +51,7 @@ org.twitter4j twitter4j-stream - 3.0.3 + 4.0.4 org.scalacheck http://git-wip-us.apache.org/repos/asf/spark/blob/e5bc8c27/external/twitter/src/main/scala/org/apache/spark/streaming/twitter/TwitterInputDStream.scala -- diff --git a/external/twitter/src/main/scala/org/apache/spark/streaming/twitter/TwitterInputDStream.scala b/external/twitter/src/main/scala/org/apache/spark/streaming/twitter/TwitterInputDStream.scala index d7de74b..9a85a65 100644 --- a/external/twitter/src/main/scala/org/apache/spark/streaming/twitter/TwitterInputDStream.scala +++ b/external/twitter/src/main/scala/org/apache/spark/streaming/twitter/TwitterInputDStream.scala @@ -87,7 +87,7 @@ class TwitterReceiver( val query = new FilterQuery if (filters.size > 0) { -query.track(filters.toArray) +query.track(filters.mkString(",")) newTwitterStream.filter(query) } else { newTwitterStream.sample() - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-11264] bin/spark-class can't find assembly jars with certain GREP_OPTIONS set
Repository: spark Updated Branches: refs/heads/branch-1.5 56f0bb6ed -> 1cd2d9ced [SPARK-11264] bin/spark-class can't find assembly jars with certain GREP_OPTIONS set Temporarily remove GREP_OPTIONS if set in bin/spark-class. Some GREP_OPTIONS will modify the output of the grep commands that are looking for the assembly jars. For example, if the -n option is specified, the grep output will look like: 5:spark-assembly-1.5.1-hadoop2.4.0.jar This will not match the regular expressions, and so the jar files will not be found. We could improve the regular expression to handle this case and trim off extra characters, but it is difficult to know which options may or may not be set. Unsetting GREP_OPTIONS within the script handles all the cases and gives the desired output. Author: Jeffrey Naisbitt Closes #9231 from naisbitt/unset-GREP_OPTIONS. (cherry picked from commit 28132ceb10d0c127495ce8cb36135e1cb54164d7) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1cd2d9ce Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1cd2d9ce Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1cd2d9ce Branch: refs/heads/branch-1.5 Commit: 1cd2d9ced55ab52899a4e57b5c3da6b444ec9ae4 Parents: 56f0bb6 Author: Jeffrey Naisbitt Authored: Sat Oct 24 18:21:36 2015 +0100 Committer: Sean Owen Committed: Sat Oct 24 18:21:47 2015 +0100 -- bin/spark-class | 1 + 1 file changed, 1 insertion(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1cd2d9ce/bin/spark-class -- diff --git a/bin/spark-class b/bin/spark-class index 2b59e5d..7db8cb5 100755 --- a/bin/spark-class +++ b/bin/spark-class @@ -42,6 +42,7 @@ else ASSEMBLY_DIR="$SPARK_HOME/assembly/target/scala-$SPARK_SCALA_VERSION" fi +GREP_OPTIONS= num_jars="$(ls -1 "$ASSEMBLY_DIR" | grep "^spark-assembly.*hadoop.*\.jar$" | wc -l)" if [ "$num_jars" -eq "0" -a -z "$SPARK_ASSEMBLY_JAR" ]; then echo "Failed to find Spark assembly in $ASSEMBLY_DIR." 1>&2 - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-11264] bin/spark-class can't find assembly jars with certain GREP_OPTIONS set
Repository: spark Updated Branches: refs/heads/master e5bc8c275 -> 28132ceb1 [SPARK-11264] bin/spark-class can't find assembly jars with certain GREP_OPTIONS set Temporarily remove GREP_OPTIONS if set in bin/spark-class. Some GREP_OPTIONS will modify the output of the grep commands that are looking for the assembly jars. For example, if the -n option is specified, the grep output will look like: 5:spark-assembly-1.5.1-hadoop2.4.0.jar This will not match the regular expressions, and so the jar files will not be found. We could improve the regular expression to handle this case and trim off extra characters, but it is difficult to know which options may or may not be set. Unsetting GREP_OPTIONS within the script handles all the cases and gives the desired output. Author: Jeffrey Naisbitt Closes #9231 from naisbitt/unset-GREP_OPTIONS. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/28132ceb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/28132ceb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/28132ceb Branch: refs/heads/master Commit: 28132ceb10d0c127495ce8cb36135e1cb54164d7 Parents: e5bc8c2 Author: Jeffrey Naisbitt Authored: Sat Oct 24 18:21:36 2015 +0100 Committer: Sean Owen Committed: Sat Oct 24 18:21:36 2015 +0100 -- bin/spark-class | 1 + 1 file changed, 1 insertion(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/28132ceb/bin/spark-class -- diff --git a/bin/spark-class b/bin/spark-class index e38e08d..8cae6cc 100755 --- a/bin/spark-class +++ b/bin/spark-class @@ -42,6 +42,7 @@ else ASSEMBLY_DIR="$SPARK_HOME/assembly/target/scala-$SPARK_SCALA_VERSION" fi +GREP_OPTIONS= num_jars="$(ls -1 "$ASSEMBLY_DIR" | grep "^spark-assembly.*hadoop.*\.jar$" | wc -l)" if [ "$num_jars" -eq "0" -a -z "$SPARK_ASSEMBLY_JAR" -a "$SPARK_PREPEND_CLASSES" != "1" ]; then echo "Failed to find Spark assembly in $ASSEMBLY_DIR." 1>&2 - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: Fix typos
Repository: spark Updated Branches: refs/heads/master 28132ceb1 -> 146da0d81 Fix typos Two typos squashed. BTW Let me know how to proceed with other typos if I ran across any. I don't feel well to leave them aside as much as sending pull requests with such tiny changes. Guide me. Author: Jacek Laskowski Closes #9250 from jaceklaskowski/typos-hunting. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/146da0d8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/146da0d8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/146da0d8 Branch: refs/heads/master Commit: 146da0d8100490a6e49a6c076ec253cdaf9f8905 Parents: 28132ce Author: Jacek Laskowski Authored: Sun Oct 25 01:33:22 2015 +0100 Committer: Sean Owen Committed: Sun Oct 25 01:33:22 2015 +0100 -- core/src/main/scala/org/apache/spark/SparkConf.scala | 2 +- core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala | 2 +- .../src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala | 3 ++- core/src/main/scala/org/apache/spark/util/ThreadUtils.scala | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/146da0d8/core/src/main/scala/org/apache/spark/SparkConf.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index 58d3b84..f023e4b 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -621,7 +621,7 @@ private[spark] object SparkConf extends Logging { /** * Return whether the given config should be passed to an executor on start-up. * - * Certain akka and authentication configs are required of the executor when it connects to + * Certain akka and authentication configs are required from the executor when it connects to * the scheduler, while the rest of the spark configs can be inherited from the driver later. */ def isExecutorStartupConf(name: String): Boolean = { http://git-wip-us.apache.org/repos/asf/spark/blob/146da0d8/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala -- diff --git a/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala b/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala index 48afe3a..fdf76d3 100644 --- a/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala +++ b/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala @@ -197,7 +197,7 @@ private[spark] class MetricsSystem private ( } } catch { case e: Exception => { -logError("Sink class " + classPath + " cannot be instantialized") +logError("Sink class " + classPath + " cannot be instantiated") throw e } } http://git-wip-us.apache.org/repos/asf/spark/blob/146da0d8/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala index f25f3ed..cb9a300 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala @@ -22,7 +22,8 @@ import org.apache.spark.executor.TaskMetrics import org.apache.spark.storage.BlockManagerId /** - * Low-level task scheduler interface, currently implemented exclusively by TaskSchedulerImpl. + * Low-level task scheduler interface, currently implemented exclusively by + * [[org.apache.spark.scheduler.TaskSchedulerImpl]]. 
* This interface allows plugging in different task schedulers. Each TaskScheduler schedules tasks * for a single SparkContext. These schedulers get sets of tasks submitted to them from the * DAGScheduler for each stage, and are responsible for sending the tasks to the cluster, running http://git-wip-us.apache.org/repos/asf/spark/blob/146da0d8/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala index 15e7519..5328344 100644 --- a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala @@ -80,7 +80,7 @@ private[spark] object ThreadUtils { } /** - * Wrapper over newSingleThreadScheduledExecutor. + * Wrapper over ScheduledThreadPoolExecutor. */ def ne
spark git commit: Fix typos
Repository: spark Updated Branches: refs/heads/branch-1.5 1cd2d9ced -> 5200a6e29 Fix typos Two typos squashed. BTW Let me know how to proceed with other typos if I ran across any. I don't feel well to leave them aside as much as sending pull requests with such tiny changes. Guide me. Author: Jacek Laskowski Closes #9250 from jaceklaskowski/typos-hunting. (cherry picked from commit 146da0d8100490a6e49a6c076ec253cdaf9f8905) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5200a6e2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5200a6e2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5200a6e2 Branch: refs/heads/branch-1.5 Commit: 5200a6e2919bf9c0acc129b4db7c15546d157351 Parents: 1cd2d9c Author: Jacek Laskowski Authored: Sun Oct 25 01:33:22 2015 +0100 Committer: Sean Owen Committed: Sun Oct 25 01:33:37 2015 +0100 -- core/src/main/scala/org/apache/spark/SparkConf.scala | 2 +- core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala | 2 +- .../src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala | 3 ++- core/src/main/scala/org/apache/spark/util/ThreadUtils.scala | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5200a6e2/core/src/main/scala/org/apache/spark/SparkConf.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index b344b5e..cad0ded 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -600,7 +600,7 @@ private[spark] object SparkConf extends Logging { /** * Return whether the given config should be passed to an executor on start-up. * - * Certain akka and authentication configs are required of the executor when it connects to + * Certain akka and authentication configs are required from the executor when it connects to * the scheduler, while the rest of the spark configs can be inherited from the driver later. */ def isExecutorStartupConf(name: String): Boolean = { http://git-wip-us.apache.org/repos/asf/spark/blob/5200a6e2/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala -- diff --git a/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala b/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala index 4517f46..d972fa7 100644 --- a/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala +++ b/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala @@ -197,7 +197,7 @@ private[spark] class MetricsSystem private ( } } catch { case e: Exception => { -logError("Sink class " + classPath + " cannot be instantialized") +logError("Sink class " + classPath + " cannot be instantiated") throw e } } http://git-wip-us.apache.org/repos/asf/spark/blob/5200a6e2/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala index f25f3ed..cb9a300 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala @@ -22,7 +22,8 @@ import org.apache.spark.executor.TaskMetrics import org.apache.spark.storage.BlockManagerId /** - * Low-level task scheduler interface, currently implemented exclusively by TaskSchedulerImpl. 
+ * Low-level task scheduler interface, currently implemented exclusively by + * [[org.apache.spark.scheduler.TaskSchedulerImpl]]. * This interface allows plugging in different task schedulers. Each TaskScheduler schedules tasks * for a single SparkContext. These schedulers get sets of tasks submitted to them from the * DAGScheduler for each stage, and are responsible for sending the tasks to the cluster, running http://git-wip-us.apache.org/repos/asf/spark/blob/5200a6e2/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala index 22e291a..06976f8 100644 --- a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala @@ -81,7 +81,7 @@ private[spark] object ThreadUtils { } /** - * Wrapper
spark git commit: [SPARK-11287] Fixed class name to properly start TestExecutor from deploy.client.TestClient
Repository: spark Updated Branches: refs/heads/master 92b9c5edd -> 80279ac18 [SPARK-11287] Fixed class name to properly start TestExecutor from deploy.client.TestClient Executing deploy.client.TestClient fails due to bad class name for TestExecutor in ApplicationDescription. Author: Bryan Cutler Closes #9255 from BryanCutler/fix-TestClient-classname-SPARK-11287. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/80279ac1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/80279ac1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/80279ac1 Branch: refs/heads/master Commit: 80279ac1875d488f7000f352a958a35536bd4c2e Parents: 92b9c5e Author: Bryan Cutler Authored: Sun Oct 25 19:05:45 2015 + Committer: Sean Owen Committed: Sun Oct 25 19:05:45 2015 + -- .../main/scala/org/apache/spark/deploy/client/TestClient.scala| 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/80279ac1/core/src/main/scala/org/apache/spark/deploy/client/TestClient.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/client/TestClient.scala b/core/src/main/scala/org/apache/spark/deploy/client/TestClient.scala index 1c79089..adb3f02 100644 --- a/core/src/main/scala/org/apache/spark/deploy/client/TestClient.scala +++ b/core/src/main/scala/org/apache/spark/deploy/client/TestClient.scala @@ -48,8 +48,9 @@ private[spark] object TestClient { val url = args(0) val conf = new SparkConf val rpcEnv = RpcEnv.create("spark", Utils.localHostName(), 0, conf, new SecurityManager(conf)) +val executorClassname = TestExecutor.getClass.getCanonicalName.stripSuffix("$") val desc = new ApplicationDescription("TestClient", Some(1), 512, - Command("spark.deploy.client.TestExecutor", Seq(), Map(), Seq(), Seq(), Seq()), "ignored") + Command(executorClassname, Seq(), Map(), Seq(), Seq(), Seq()), "ignored") val listener = new TestListener val client = new AppClient(rpcEnv, Array(url), desc, listener, new SparkConf) client.start() - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-11287] Fixed class name to properly start TestExecutor from deploy.client.TestClient
Repository: spark Updated Branches: refs/heads/branch-1.5 36fddb07a -> 74921c219 [SPARK-11287] Fixed class name to properly start TestExecutor from deploy.client.TestClient Executing deploy.client.TestClient fails due to bad class name for TestExecutor in ApplicationDescription. Author: Bryan Cutler Closes #9255 from BryanCutler/fix-TestClient-classname-SPARK-11287. (cherry picked from commit 80279ac1875d488f7000f352a958a35536bd4c2e) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/74921c21 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/74921c21 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/74921c21 Branch: refs/heads/branch-1.5 Commit: 74921c219ba1c8917a6b69f36a44c488d10804e4 Parents: 36fddb0 Author: Bryan Cutler Authored: Sun Oct 25 19:05:45 2015 + Committer: Sean Owen Committed: Sun Oct 25 19:05:55 2015 + -- .../main/scala/org/apache/spark/deploy/client/TestClient.scala| 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/74921c21/core/src/main/scala/org/apache/spark/deploy/client/TestClient.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/client/TestClient.scala b/core/src/main/scala/org/apache/spark/deploy/client/TestClient.scala index 1c79089..adb3f02 100644 --- a/core/src/main/scala/org/apache/spark/deploy/client/TestClient.scala +++ b/core/src/main/scala/org/apache/spark/deploy/client/TestClient.scala @@ -48,8 +48,9 @@ private[spark] object TestClient { val url = args(0) val conf = new SparkConf val rpcEnv = RpcEnv.create("spark", Utils.localHostName(), 0, conf, new SecurityManager(conf)) +val executorClassname = TestExecutor.getClass.getCanonicalName.stripSuffix("$") val desc = new ApplicationDescription("TestClient", Some(1), 512, - Command("spark.deploy.client.TestExecutor", Seq(), Map(), Seq(), Seq(), Seq()), "ignored") + Command(executorClassname, Seq(), Map(), Seq(), Seq(), Seq()), "ignored") val listener = new TestListener val client = new AppClient(rpcEnv, Array(url), desc, listener, new SparkConf) client.start() - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-5966][WIP] Spark-submit deploy-mode cluster is not compatible with master local>
Repository: spark Updated Branches: refs/heads/master 05c4bdb57 -> 616be29c7 [SPARK-5966][WIP] Spark-submit deploy-mode cluster is not compatible with master local> ⦠master local> Author: Kevin Yu Closes #9220 from kevinyu98/working_on_spark-5966. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/616be29c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/616be29c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/616be29c Branch: refs/heads/master Commit: 616be29c7f2ebc184bd5ec97210da36a2174d80c Parents: 05c4bdb Author: Kevin Yu Authored: Mon Oct 26 09:34:15 2015 + Committer: Sean Owen Committed: Mon Oct 26 09:35:19 2015 + -- core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala | 2 ++ 1 file changed, 2 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/616be29c/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 640cc32..84ae122 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -328,6 +328,8 @@ object SparkSubmit { case (STANDALONE, CLUSTER) if args.isR => printErrorAndExit("Cluster deploy mode is currently not supported for R " + "applications on standalone clusters.") + case (LOCAL, CLUSTER) => +printErrorAndExit("Cluster deploy mode is not compatible with master \"local\"") case (_, CLUSTER) if isShell(args.primaryResource) => printErrorAndExit("Cluster deploy mode is not applicable to Spark shells.") case (_, CLUSTER) if isSqlShell(args.mainClass) => - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-5966][WIP] Spark-submit deploy-mode cluster is not compatible with master local>
Repository: spark Updated Branches: refs/heads/branch-1.5 74921c219 -> a355d0d0d [SPARK-5966][WIP] Spark-submit deploy-mode cluster is not compatible with master local> ⦠master local> Author: Kevin Yu Closes #9220 from kevinyu98/working_on_spark-5966. (cherry picked from commit 616be29c7f2ebc184bd5ec97210da36a2174d80c) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a355d0d0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a355d0d0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a355d0d0 Branch: refs/heads/branch-1.5 Commit: a355d0d0d1dad72ccdffcb41c29b50f4aa051a48 Parents: 74921c2 Author: Kevin Yu Authored: Mon Oct 26 09:34:15 2015 + Committer: Sean Owen Committed: Mon Oct 26 09:35:52 2015 + -- core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala | 2 ++ 1 file changed, 2 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a355d0d0/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 86fcf94..fefbba9 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -328,6 +328,8 @@ object SparkSubmit { case (STANDALONE, CLUSTER) if args.isR => printErrorAndExit("Cluster deploy mode is currently not supported for R " + "applications on standalone clusters.") + case (LOCAL, CLUSTER) => +printErrorAndExit("Cluster deploy mode is not compatible with master \"local\"") case (_, CLUSTER) if isShell(args.primaryResource) => printErrorAndExit("Cluster deploy mode is not applicable to Spark shells.") case (_, CLUSTER) if isSqlShell(args.mainClass) => - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-11276][CORE] SizeEstimator prevents class unloading
Repository: spark Updated Branches: refs/heads/master d77d198fc -> feb8d6a44 [SPARK-11276][CORE] SizeEstimator prevents class unloading The SizeEstimator keeps a cache of ClassInfos but this cache uses Class objects as keys. Which results in strong references to the Class objects. If these classes are dynamically created this prevents the corresponding ClassLoader from being GCed. Leading to PermGen exhaustion. We use a Map with WeakKeys to prevent this issue. Author: Sem Mulder Closes #9244 from SemMulder/fix-sizeestimator-classunloading. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/feb8d6a4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/feb8d6a4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/feb8d6a4 Branch: refs/heads/master Commit: feb8d6a44fbfc31a880c0cfcaadc91786073 Parents: d77d198 Author: Sem Mulder Authored: Tue Oct 27 07:55:10 2015 + Committer: Sean Owen Committed: Tue Oct 27 07:55:10 2015 + -- core/src/main/scala/org/apache/spark/util/SizeEstimator.scala | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/feb8d6a4/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala b/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala index 14b1f2a..23ee4ef 100644 --- a/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala +++ b/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala @@ -17,6 +17,8 @@ package org.apache.spark.util +import com.google.common.collect.MapMaker + import java.lang.management.ManagementFactory import java.lang.reflect.{Field, Modifier} import java.util.{IdentityHashMap, Random} @@ -29,7 +31,6 @@ import org.apache.spark.Logging import org.apache.spark.annotation.DeveloperApi import org.apache.spark.util.collection.OpenHashSet - /** * :: DeveloperApi :: * Estimates the sizes of Java objects (number of bytes of memory they occupy), for use in @@ -73,7 +74,8 @@ object SizeEstimator extends Logging { private val ALIGN_SIZE = 8 // A cache of ClassInfo objects for each class - private val classInfos = new ConcurrentHashMap[Class[_], ClassInfo] + // We use weakKeys to allow GC of dynamically created classes + private val classInfos = new MapMaker().weakKeys().makeMap[Class[_], ClassInfo]() // Object and pointer sizes are arch dependent private var is64bit = false - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
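The weak-keys pattern behind the fix can be shown in isolation. A minimal sketch using Guava's `MapMaker`, with a simplified stand-in for Spark's private `ClassInfo`:

```scala
import com.google.common.collect.MapMaker

// Simplified stand-in for Spark's private ClassInfo cache entry.
case class ClassInfo(shellSize: Long)

// Weak keys mean the map holds no strong references to the Class objects,
// so dynamically generated classes (and their ClassLoader) remain eligible for GC.
val classInfos = new MapMaker().weakKeys().makeMap[Class[_], ClassInfo]()
classInfos.put(classOf[java.lang.String], ClassInfo(40L))
println(classInfos.get(classOf[java.lang.String]))
```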
spark git commit: [SPARK-11318] Include hive profile in make-distribution.sh command
Repository: spark Updated Branches: refs/heads/master f79ebf2a9 -> f304f9c9a [SPARK-11318] Include hive profile in make-distribution.sh command Author: tedyu Closes #9281 from tedyu/master. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f304f9c9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f304f9c9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f304f9c9 Branch: refs/heads/master Commit: f304f9c9a1c954b3b5786f90bb13f543637d3192 Parents: f79ebf2 Author: tedyu Authored: Thu Oct 29 15:02:13 2015 +0100 Committer: Sean Owen Committed: Thu Oct 29 15:02:13 2015 +0100 -- docs/building-spark.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f304f9c9/docs/building-spark.md -- diff --git a/docs/building-spark.md b/docs/building-spark.md index 743643c..4f73adb 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -38,7 +38,7 @@ To create a Spark distribution like those distributed by the to be runnable, use `make-distribution.sh` in the project root directory. It can be configured with Maven profile settings and so on like the direct Maven build. Example: -./make-distribution.sh --name custom-spark --tgz -Phadoop-2.4 -Pyarn +./make-distribution.sh --name custom-spark --tgz -Psparkr -Phadoop-2.4 -Phive -Phive-thriftserver -Pyarn For more information on usage, run `./make-distribution.sh --help` - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-11388][BUILD] Fix self closing tags.
Repository: spark Updated Branches: refs/heads/master f304f9c9a -> 3bb2a8d75 [SPARK-11388][BUILD] Fix self closing tags. Java 8 javadoc does not like self closing tags: ``, ``, ... This PR fixes those. Author: Herman van Hovell Closes #9339 from hvanhovell/SPARK-11388. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3bb2a8d7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3bb2a8d7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3bb2a8d7 Branch: refs/heads/master Commit: 3bb2a8d7508b507edfcc21bd20912b0ff4a0a248 Parents: f304f9c Author: Herman van Hovell Authored: Thu Oct 29 15:11:00 2015 +0100 Committer: Sean Owen Committed: Thu Oct 29 15:11:00 2015 +0100 -- .../main/java/org/apache/spark/launcher/SparkAppHandle.java | 4 ++-- .../main/java/org/apache/spark/launcher/SparkLauncher.java | 8 2 files changed, 6 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3bb2a8d7/launcher/src/main/java/org/apache/spark/launcher/SparkAppHandle.java -- diff --git a/launcher/src/main/java/org/apache/spark/launcher/SparkAppHandle.java b/launcher/src/main/java/org/apache/spark/launcher/SparkAppHandle.java index 2896a91..13dd9f1 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/SparkAppHandle.java +++ b/launcher/src/main/java/org/apache/spark/launcher/SparkAppHandle.java @@ -19,7 +19,7 @@ package org.apache.spark.launcher; /** * A handle to a running Spark application. - * + * * Provides runtime information about the underlying Spark application, and actions to control it. * * @since 1.6.0 @@ -110,7 +110,7 @@ public interface SparkAppHandle { * Callback for changes in the handle's state. * * @param handle The updated handle. - * @see {@link SparkAppHandle#getState()} + * @see SparkAppHandle#getState() */ void stateChanged(SparkAppHandle handle); http://git-wip-us.apache.org/repos/asf/spark/blob/3bb2a8d7/launcher/src/main/java/org/apache/spark/launcher/SparkLauncher.java -- diff --git a/launcher/src/main/java/org/apache/spark/launcher/SparkLauncher.java b/launcher/src/main/java/org/apache/spark/launcher/SparkLauncher.java index 5d74b37..dd1c93a 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/SparkLauncher.java +++ b/launcher/src/main/java/org/apache/spark/launcher/SparkLauncher.java @@ -350,7 +350,7 @@ public class SparkLauncher { /** * Launches a sub-process that will start the configured Spark application. - * + * * The {@link #startApplication(SparkAppHandle.Listener...)} method is preferred when launching * Spark, since it provides better control of the child application. * @@ -362,16 +362,16 @@ public class SparkLauncher { /** * Starts a Spark application. - * + * * This method returns a handle that provides information about the running application and can * be used to do basic interaction with it. - * + * * The returned handle assumes that the application will instantiate a single SparkContext * during its lifetime. Once that context reports a final state (one that indicates the * SparkContext has stopped), the handle will not perform new state transitions, so anything * that happens after that cannot be monitored. If the underlying application is launched as * a child process, {@link SparkAppHandle#kill()} can still be used to kill the child process. - * + * * Currently, all applications are launched as child processes. The child's stdout and stderr * are merged and written to a logger (see java.util.logging). 
The logger's name * can be defined by setting {@link #CHILD_PROCESS_LOGGER_NAME} in the app's configuration. If - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-10986][MESOS] Set the context class loader in the Mesos executor backend.
Repository: spark Updated Branches: refs/heads/master 14d08b990 -> 0451b0014 [SPARK-10986][MESOS] Set the context class loader in the Mesos executor backend. See [SPARK-10986](https://issues.apache.org/jira/browse/SPARK-10986) for details. This fixes the `ClassNotFoundException` for Spark classes in the serializer. I am not sure this is the right way to handle the class loader, but I couldn't find any documentation on how the context class loader is used and who relies on it. It seems at least the serializer uses it to instantiate classes during deserialization. I am open to suggestions (I tried this fix on a real Mesos cluster and it *does* fix the issue). tnachen andrewor14 Author: Iulian Dragos Closes #9282 from dragos/issue/mesos-classloader. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0451b001 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0451b001 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0451b001 Branch: refs/heads/master Commit: 0451b00148a294c665146563242d2fe2de943a02 Parents: 14d08b9 Author: Iulian Dragos Authored: Fri Oct 30 16:51:32 2015 + Committer: Sean Owen Committed: Fri Oct 30 16:51:32 2015 + -- .../scala/org/apache/spark/executor/MesosExecutorBackend.scala | 5 + 1 file changed, 5 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0451b001/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala -- diff --git a/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala b/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala index 0474fd2..c9f18eb 100644 --- a/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala +++ b/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala @@ -63,6 +63,11 @@ private[spark] class MesosExecutorBackend logInfo(s"Registered with Mesos as executor ID $executorId with $cpusPerTask cpus") this.driver = driver +// Set a context class loader to be picked up by the serializer. Without this call +// the serializer would default to the null class loader, and fail to find Spark classes +// See SPARK-10986. +Thread.currentThread().setContextClassLoader(this.getClass.getClassLoader) + val properties = Utils.deserialize[Array[(String, String)]](executorInfo.getData.toByteArray) ++ Seq[(String, String)](("spark.app.id", frameworkInfo.getId.getValue)) val conf = new SparkConf(loadDefaults = true).setAll(properties) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
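A stripped-down sketch of why that one-liner matters, under the assumption (stated in the commit) that deserializers resolve classes through the thread's context class loader. The `onRegistered`/`deserialize` names and the plain Java serialization are illustrative, not Spark's actual serializer stack.

```scala
import java.io.{ByteArrayInputStream, ObjectInputStream, ObjectStreamClass}

object ContextClassLoaderSketch {

  // The essence of the fix: point the context class loader at the loader that
  // loaded the Spark classes, before any deserialization happens on this thread.
  def onRegistered(): Unit = {
    Thread.currentThread().setContextClassLoader(getClass.getClassLoader)
  }

  // A deserializer in the style that consults the context class loader; if that
  // loader cannot see the Spark assembly, readObject fails with ClassNotFoundException.
  def deserialize[T](bytes: Array[Byte]): T = {
    val in = new ObjectInputStream(new ByteArrayInputStream(bytes)) {
      override protected def resolveClass(desc: ObjectStreamClass): Class[_] = {
        val loader = Option(Thread.currentThread().getContextClassLoader)
          .getOrElse(getClass.getClassLoader)
        Class.forName(desc.getName, false, loader)
      }
    }
    try in.readObject().asInstanceOf[T] finally in.close()
  }
}
```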
spark git commit: [SPARK-11342][TESTS] Allow to set hadoop profile when running dev/ru…
Repository: spark Updated Branches: refs/heads/master 40c77fb23 -> 729f983e6 [SPARK-11342][TESTS] Allow to set hadoop profile when running dev/ru… …n_tests Author: Jeff Zhang Closes #9295 from zjffdu/SPARK-11342. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/729f983e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/729f983e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/729f983e Branch: refs/heads/master Commit: 729f983e66cf65da2e8f48c463ccde2b355240c4 Parents: 40c77fb Author: Jeff Zhang Authored: Fri Oct 30 18:50:12 2015 + Committer: Sean Owen Committed: Fri Oct 30 18:50:12 2015 + -- dev/run-tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/729f983e/dev/run-tests.py -- diff --git a/dev/run-tests.py b/dev/run-tests.py index 6b4b710..9e1abb0 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -486,7 +486,7 @@ def main(): else: # else we're running locally and can use local settings build_tool = "sbt" -hadoop_version = "hadoop2.3" +hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.3") test_env = "local" print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version, - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-11226][SQL] Empty line in json file should be skipped
Repository: spark Updated Branches: refs/heads/master 3c471885d -> 97b3c8fb4 [SPARK-11226][SQL] Empty line in json file should be skipped Currently the empty line in json file will be parsed into Row with all null field values. But in json, "{}" represents a json object, empty line is supposed to be skipped. Make a trivial change for this. Author: Jeff Zhang Closes #9211 from zjffdu/SPARK-11226. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/97b3c8fb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/97b3c8fb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/97b3c8fb Branch: refs/heads/master Commit: 97b3c8fb470f0d3c1cdb1aeb27f675e695442e87 Parents: 3c47188 Author: Jeff Zhang Authored: Sat Oct 31 11:10:37 2015 + Committer: Sean Owen Committed: Sat Oct 31 11:10:37 2015 + -- .../datasources/json/JacksonParser.scala| 46 +++- .../org/apache/spark/sql/SQLQuerySuite.scala| 11 + .../execution/datasources/json/JsonSuite.scala | 3 -- 3 files changed, 36 insertions(+), 24 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/97b3c8fb/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala index b2e5201..4f53eeb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala @@ -245,29 +245,33 @@ private[sql] object JacksonParser { val factory = new JsonFactory() iter.flatMap { record => -try { - Utils.tryWithResource(factory.createParser(record)) { parser => -parser.nextToken() - -convertField(factory, parser, schema) match { - case null => failedRecord(record) - case row: InternalRow => row :: Nil - case array: ArrayData => -if (array.numElements() == 0) { - Nil -} else { - array.toArray[InternalRow](schema) -} - case _ => -sys.error( - s"Failed to parse record $record. Please make sure that each line of the file " + -"(or each string in the RDD) is a valid JSON object or " + -"an array of JSON objects.") +if (record.trim.isEmpty) { + Nil +} else { + try { +Utils.tryWithResource(factory.createParser(record)) { parser => + parser.nextToken() + + convertField(factory, parser, schema) match { +case null => failedRecord(record) +case row: InternalRow => row :: Nil +case array: ArrayData => + if (array.numElements() == 0) { +Nil + } else { +array.toArray[InternalRow](schema) + } +case _ => + sys.error( +s"Failed to parse record $record. 
Please make sure that each line of " + + "the file (or each string in the RDD) is a valid JSON object or " + + "an array of JSON objects.") + } } + } catch { +case _: JsonProcessingException => + failedRecord(record) } -} catch { - case _: JsonProcessingException => -failedRecord(record) } } } http://git-wip-us.apache.org/repos/asf/spark/blob/97b3c8fb/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 5a616fa..5413ef1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -225,6 +225,17 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { Seq(Row("1"), Row("2"))) } + test("SPARK-11226 Skip empty line in json file") { +sqlContext.read.json( + sparkContext.parallelize( +Seq("{\"a\": \"1\"}}", "{\"a\": \"2\"}}", "{\"a\": \"3\"}}", ""))) + .registerTempTable("d") + +checkAnswer( + sql("select count(1) from d"), + Seq(Row(3))) + } + test("SPARK-8828 sum should return null if all input va
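A minimal usage sketch of the new behavior, written against the 1.x `SQLContext` API that the patch touches; the app name and local master are placeholders.

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object SkipEmptyJsonLines {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("SkipEmptyJsonLines").setMaster("local[*]"))
    val sqlContext = new SQLContext(sc)

    // Two real records with an empty string in between: after this change the empty
    // line produces no row at all, instead of a row whose fields are all null.
    val lines = sc.parallelize(Seq("""{"a": "1"}""", "", """{"a": "2"}"""))
    val df = sqlContext.read.json(lines)

    assert(df.count() == 2)
    df.show()
    sc.stop()
  }
}
```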
spark git commit: [SPARK-11305][DOCS] Remove Third-Party Hadoop Distributions Doc Page
Repository: spark Updated Branches: refs/heads/master aa494a9c2 -> 643c49c75 [SPARK-11305][DOCS] Remove Third-Party Hadoop Distributions Doc Page Remove Hadoop third party distro page, and move Hadoop cluster config info to configuration page CC pwendell Author: Sean Owen Closes #9298 from srowen/SPARK-11305. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/643c49c7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/643c49c7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/643c49c7 Branch: refs/heads/master Commit: 643c49c75ee95243fd19ae73b5170e6e6e212b8d Parents: aa494a9 Author: Sean Owen Authored: Sun Nov 1 12:25:49 2015 + Committer: Sean Owen Committed: Sun Nov 1 12:25:49 2015 + -- README.md| 5 +- docs/_layouts/global.html| 1 - docs/configuration.md| 15 docs/hadoop-third-party-distributions.md | 117 -- docs/index.md| 1 - docs/programming-guide.md| 9 +- 6 files changed, 19 insertions(+), 129 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/643c49c7/README.md -- diff --git a/README.md b/README.md index 4116ef3..c0d6a94 100644 --- a/README.md +++ b/README.md @@ -87,10 +87,7 @@ Hadoop, you must build Spark against the same version that your cluster runs. Please refer to the build documentation at ["Specifying the Hadoop Version"](http://spark.apache.org/docs/latest/building-spark.html#specifying-the-hadoop-version) for detailed guidance on building for a particular distribution of Hadoop, including -building for particular Hive and Hive Thriftserver distributions. See also -["Third Party Hadoop Distributions"](http://spark.apache.org/docs/latest/hadoop-third-party-distributions.html) -for guidance on building a Spark application that works with a particular -distribution. +building for particular Hive and Hive Thriftserver distributions. ## Configuration http://git-wip-us.apache.org/repos/asf/spark/blob/643c49c7/docs/_layouts/global.html -- diff --git a/docs/_layouts/global.html b/docs/_layouts/global.html index b4952fe..467ff7a 100755 --- a/docs/_layouts/global.html +++ b/docs/_layouts/global.html @@ -112,7 +112,6 @@ Job Scheduling Security Hardware Provisioning -3rd-Party Hadoop Distros Building Spark https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark";>Contributing to Spark http://git-wip-us.apache.org/repos/asf/spark/blob/643c49c7/docs/configuration.md -- diff --git a/docs/configuration.md b/docs/configuration.md index 682384d..c276e8e 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1674,3 +1674,18 @@ Spark uses [log4j](http://logging.apache.org/log4j/) for logging. You can config To specify a different configuration directory other than the default "SPARK_HOME/conf", you can set SPARK_CONF_DIR. Spark will use the the configuration files (spark-defaults.conf, spark-env.sh, log4j.properties, etc) from this directory. + +# Inheriting Hadoop Cluster Configuration + +If you plan to read and write from HDFS using Spark, there are two Hadoop configuration files that +should be included on Spark's classpath: + +* `hdfs-site.xml`, which provides default behaviors for the HDFS client. +* `core-site.xml`, which sets the default filesystem name. + +The location of these configuration files varies across CDH and HDP versions, but +a common location is inside of `/etc/hadoop/conf`. Some tools, such as Cloudera Manager, create +configurations on-the-fly, but offer a mechanisms to download copies of them. 
+ +To make these files visible to Spark, set `HADOOP_CONF_DIR` in `$SPARK_HOME/spark-env.sh` +to a location containing the configuration files. http://git-wip-us.apache.org/repos/asf/spark/blob/643c49c7/docs/hadoop-third-party-distributions.md -- diff --git a/docs/hadoop-third-party-distributions.md b/docs/hadoop-third-party-distributions.md deleted file mode 100644 index 795dd82..000 --- a/docs/hadoop-third-party-distributions.md +++ /dev/null @@ -1,117 +0,0 @@ -layout: global -title: Third-Party Hadoop Distributions - -Spark can run against all versions of Cloudera's D
spark git commit: [SPARK-11271][SPARK-11016][CORE] Use Spark BitSet instead of RoaringBitmap to reduce memory usage
Repository: spark Updated Branches: refs/heads/master e963070c1 -> e209fa271 [SPARK-11271][SPARK-11016][CORE] Use Spark BitSet instead of RoaringBitmap to reduce memory usage JIRA: https://issues.apache.org/jira/browse/SPARK-11271 As reported in the JIRA ticket, when there are too many tasks, the memory usage of MapStatus will cause problem. Use BitSet instead of RoaringBitMap should be more efficient in memory usage. Author: Liang-Chi Hsieh Closes #9243 from viirya/mapstatus-bitset. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e209fa27 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e209fa27 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e209fa27 Branch: refs/heads/master Commit: e209fa271ae57dc8849f8b1241bf1ea7d6d3d62c Parents: e963070 Author: Liang-Chi Hsieh Authored: Mon Nov 2 08:52:52 2015 + Committer: Sean Owen Committed: Mon Nov 2 08:52:52 2015 + -- core/pom.xml| 4 -- .../org/apache/spark/scheduler/MapStatus.scala | 13 +++--- .../spark/serializer/KryoSerializer.scala | 10 +--- .../apache/spark/util/collection/BitSet.scala | 28 +-- .../spark/serializer/KryoSerializerSuite.scala | 6 --- .../spark/util/collection/BitSetSuite.scala | 49 pom.xml | 5 -- 7 files changed, 82 insertions(+), 33 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e209fa27/core/pom.xml -- diff --git a/core/pom.xml b/core/pom.xml index 319a500..1b6b135 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -174,10 +174,6 @@ lz4 - org.roaringbitmap - RoaringBitmap - - commons-net commons-net http://git-wip-us.apache.org/repos/asf/spark/blob/e209fa27/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala b/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala index 1efce12..180c8d1 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala @@ -19,9 +19,8 @@ package org.apache.spark.scheduler import java.io.{Externalizable, ObjectInput, ObjectOutput} -import org.roaringbitmap.RoaringBitmap - import org.apache.spark.storage.BlockManagerId +import org.apache.spark.util.collection.BitSet import org.apache.spark.util.Utils /** @@ -133,7 +132,7 @@ private[spark] class CompressedMapStatus( private[spark] class HighlyCompressedMapStatus private ( private[this] var loc: BlockManagerId, private[this] var numNonEmptyBlocks: Int, -private[this] var emptyBlocks: RoaringBitmap, +private[this] var emptyBlocks: BitSet, private[this] var avgSize: Long) extends MapStatus with Externalizable { @@ -146,7 +145,7 @@ private[spark] class HighlyCompressedMapStatus private ( override def location: BlockManagerId = loc override def getSizeForBlock(reduceId: Int): Long = { -if (emptyBlocks.contains(reduceId)) { +if (emptyBlocks.get(reduceId)) { 0 } else { avgSize @@ -161,7 +160,7 @@ private[spark] class HighlyCompressedMapStatus private ( override def readExternal(in: ObjectInput): Unit = Utils.tryOrIOException { loc = BlockManagerId(in) -emptyBlocks = new RoaringBitmap() +emptyBlocks = new BitSet emptyBlocks.readExternal(in) avgSize = in.readLong() } @@ -177,15 +176,15 @@ private[spark] object HighlyCompressedMapStatus { // From a compression standpoint, it shouldn't matter whether we track empty or non-empty // blocks. 
From a performance standpoint, we benefit from tracking empty blocks because // we expect that there will be far fewer of them, so we will perform fewer bitmap insertions. -val emptyBlocks = new RoaringBitmap() val totalNumBlocks = uncompressedSizes.length +val emptyBlocks = new BitSet(totalNumBlocks) while (i < totalNumBlocks) { var size = uncompressedSizes(i) if (size > 0) { numNonEmptyBlocks += 1 totalSize += size } else { -emptyBlocks.add(i) +emptyBlocks.set(i) } i += 1 } http://git-wip-us.apache.org/repos/asf/spark/blob/e209fa27/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala -- diff --git a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala index c5195c1..bc51d4f 100644 --- a/core/src/main/sc
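The bookkeeping loop from HighlyCompressedMapStatus, re-sketched as standalone code. It substitutes `java.util.BitSet` for Spark's private `org.apache.spark.util.collection.BitSet`, so read it as an illustration of the one-bit-per-block idea rather than the class itself.

```scala
import java.util.{BitSet => JBitSet}

object EmptyBlockTrackingSketch {

  // Walk the per-block sizes once: non-empty blocks contribute to the average,
  // empty blocks are recorded as a single bit instead of a RoaringBitmap entry.
  def compress(uncompressedSizes: Array[Long]): (JBitSet, Long) = {
    val totalNumBlocks = uncompressedSizes.length
    val emptyBlocks = new JBitSet(totalNumBlocks)
    var numNonEmptyBlocks = 0
    var totalSize = 0L
    var i = 0
    while (i < totalNumBlocks) {
      val size = uncompressedSizes(i)
      if (size > 0) {
        numNonEmptyBlocks += 1
        totalSize += size
      } else {
        emptyBlocks.set(i)
      }
      i += 1
    }
    val avgSize = if (numNonEmptyBlocks > 0) totalSize / numNonEmptyBlocks else 0L
    (emptyBlocks, avgSize)
  }

  // Lookup mirrors getSizeForBlock: zero for a known-empty block, the average otherwise.
  def sizeForBlock(emptyBlocks: JBitSet, avgSize: Long, reduceId: Int): Long =
    if (emptyBlocks.get(reduceId)) 0L else avgSize
}
```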
spark git commit: [SPARK-11413][BUILD] Bump joda-time version to 2.9 for java 8 and s3
Repository: spark Updated Branches: refs/heads/master e209fa271 -> ea4a3e7d0 [SPARK-11413][BUILD] Bump joda-time version to 2.9 for java 8 and s3 It's a known issue that joda-time before 2.8.1 is incompatible with java 1.8u60 or later, which causes s3 request to fail. This affects Spark when using s3 as data source. https://github.com/aws/aws-sdk-java/issues/444 Author: Yongjia Wang Closes #9379 from yongjiaw/SPARK-11413. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ea4a3e7d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ea4a3e7d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ea4a3e7d Branch: refs/heads/master Commit: ea4a3e7d06dd4a0f669460513b27469c468214fb Parents: e209fa2 Author: Yongjia Wang Authored: Mon Nov 2 08:59:35 2015 + Committer: Sean Owen Committed: Mon Nov 2 08:59:35 2015 + -- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ea4a3e7d/pom.xml -- diff --git a/pom.xml b/pom.xml index 50c8f29..762bfc7 100644 --- a/pom.xml +++ b/pom.xml @@ -176,7 +176,7 @@ 3.2.10 2.7.8 1.9 -2.5 +2.9 3.5.2 1.3.9 0.9.2 - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-11344] Made ApplicationDescription and DriverDescription case classes
Repository: spark Updated Branches: refs/heads/master b86f2cab6 -> 233e534ac [SPARK-11344] Made ApplicationDescription and DriverDescription case classes DriverDescription refactored to case class because it included no mutable fields. ApplicationDescription had one mutable field, which was appUiUrl. This field was set by the driver to point to the driver web UI. Master was modifying this field when the application was removed to redirect requests to history server. This was wrong because objects which are sent over the wire should be immutable. Now appUiUrl is immutable in ApplicationDescription and always points to the driver UI even if it is already shutdown. The UI url which master exposes to the user and modifies dynamically is now included into ApplicationInfo - a data object which describes the application state internally in master. That URL in ApplicationInfo is initialised with the value from ApplicationDescription. ApplicationDescription also included value user, which is now a part of case class fields. Author: Jacek Lewandowski Closes #9299 from jacek-lewandowski/SPARK-11344. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/233e534a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/233e534a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/233e534a Branch: refs/heads/master Commit: 233e534ac43ea25ac1b0e6a985f6928d46c5d03a Parents: b86f2ca Author: Jacek Lewandowski Authored: Tue Nov 3 12:46:11 2015 + Committer: Sean Owen Committed: Tue Nov 3 12:46:11 2015 + -- .../spark/deploy/ApplicationDescription.scala | 33 ++-- .../apache/spark/deploy/DriverDescription.scala | 21 - .../spark/deploy/master/ApplicationInfo.scala | 7 + .../org/apache/spark/deploy/master/Master.scala | 12 --- .../deploy/master/ui/ApplicationPage.scala | 2 +- .../spark/deploy/master/ui/MasterPage.scala | 2 +- .../apache/spark/deploy/DeployTestUtils.scala | 3 +- 7 files changed, 34 insertions(+), 46 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/233e534a/core/src/main/scala/org/apache/spark/deploy/ApplicationDescription.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/ApplicationDescription.scala b/core/src/main/scala/org/apache/spark/deploy/ApplicationDescription.scala index ae99432..78bbd5c 100644 --- a/core/src/main/scala/org/apache/spark/deploy/ApplicationDescription.scala +++ b/core/src/main/scala/org/apache/spark/deploy/ApplicationDescription.scala @@ -19,30 +19,17 @@ package org.apache.spark.deploy import java.net.URI -private[spark] class ApplicationDescription( -val name: String, -val maxCores: Option[Int], -val memoryPerExecutorMB: Int, -val command: Command, -var appUiUrl: String, -val eventLogDir: Option[URI] = None, +private[spark] case class ApplicationDescription( +name: String, +maxCores: Option[Int], +memoryPerExecutorMB: Int, +command: Command, +appUiUrl: String, +eventLogDir: Option[URI] = None, // short name of compression codec used when writing event logs, if any (e.g. 
lzf) -val eventLogCodec: Option[String] = None, -val coresPerExecutor: Option[Int] = None) - extends Serializable { - - val user = System.getProperty("user.name", "") - - def copy( - name: String = name, - maxCores: Option[Int] = maxCores, - memoryPerExecutorMB: Int = memoryPerExecutorMB, - command: Command = command, - appUiUrl: String = appUiUrl, - eventLogDir: Option[URI] = eventLogDir, - eventLogCodec: Option[String] = eventLogCodec): ApplicationDescription = -new ApplicationDescription( - name, maxCores, memoryPerExecutorMB, command, appUiUrl, eventLogDir, eventLogCodec) +eventLogCodec: Option[String] = None, +coresPerExecutor: Option[Int] = None, +user: String = System.getProperty("user.name", "")) { override def toString: String = "ApplicationDescription(" + name + ")" } http://git-wip-us.apache.org/repos/asf/spark/blob/233e534a/core/src/main/scala/org/apache/spark/deploy/DriverDescription.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/DriverDescription.scala b/core/src/main/scala/org/apache/spark/deploy/DriverDescription.scala index 659fb43..1f5626a 100644 --- a/core/src/main/scala/org/apache/spark/deploy/DriverDescription.scala +++ b/core/src/main/scala/org/apache/spark/deploy/DriverDescription.scala @@ -17,21 +17,12 @@ package org.apache.spark.deploy -private[deploy] class DriverDescription( -val jarUrl: String, -val mem: Int, -val cores: Int, -val supervise: Boolean, -val c
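The design point in miniature, using simplified stand-ins rather than the private[spark] classes: the description that crosses the wire is an immutable case class, and the only mutable UI URL lives in a master-side bookkeeping object seeded from it.

```scala
// Illustrative stand-ins only; field lists are abbreviated.
case class AppDescriptionSketch(
    name: String,
    maxCores: Option[Int],
    appUiUrl: String,
    user: String = System.getProperty("user.name", ""))

class AppInfoSketch(desc: AppDescriptionSketch) {
  // Starts at the driver UI; the master can later point it at the history server
  // without mutating anything that was ever sent over the wire.
  var currentUiUrl: String = desc.appUiUrl
}

object AppDescriptionSketchDemo extends App {
  val desc = AppDescriptionSketch("my-app", Some(4), "http://driver-host:4040")
  val renamed = desc.copy(name = "my-app-v2") // generated copy, original stays untouched
  println(renamed)

  val info = new AppInfoSketch(desc)
  info.currentUiUrl = "http://history-server:18080/history/app-1234"
  println(info.currentUiUrl)
}
```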
spark git commit: [SPARK-2960][DEPLOY] Support executing Spark from symlinks (reopen)
Repository: spark Updated Branches: refs/heads/master 2692bdb7d -> 8aff36e91 [SPARK-2960][DEPLOY] Support executing Spark from symlinks (reopen) This PR is based on the work of roji to support running Spark scripts from symlinks. Thanks for the great work roji . Would you mind taking a look at this PR, thanks a lot. For releases like HDP and others, normally it will expose the Spark executables as symlinks and put in `PATH`, but current Spark's scripts do not support finding real path from symlink recursively, this will make spark fail to execute from symlink. This PR try to solve this issue by finding the absolute path from symlink. Instead of using `readlink -f` like what this PR (https://github.com/apache/spark/pull/2386) implemented is that `-f` is not support for Mac, so here manually seeking the path through loop. I've tested with Mac and Linux (Cent OS), looks fine. This PR did not fix the scripts under `sbin` folder, not sure if it needs to be fixed also? Please help to review, any comment is greatly appreciated. Author: jerryshao Author: Shay Rojansky Closes #8669 from jerryshao/SPARK-2960. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8aff36e9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8aff36e9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8aff36e9 Branch: refs/heads/master Commit: 8aff36e91de0fee2f3f56c6d240bb203b5bb48ba Parents: 2692bdb Author: jerryshao Authored: Wed Nov 4 10:49:34 2015 + Committer: Sean Owen Committed: Wed Nov 4 10:49:34 2015 + -- bin/beeline | 8 +--- bin/load-spark-env.sh | 32 ++-- bin/pyspark | 14 -- bin/run-example | 18 ++ bin/spark-class | 15 --- bin/spark-shell | 9 ++--- bin/spark-sql | 7 +-- bin/spark-submit| 6 -- bin/sparkR | 9 ++--- sbin/slaves.sh | 9 + sbin/spark-config.sh| 23 +++ sbin/spark-daemon.sh| 23 --- sbin/spark-daemons.sh | 9 + sbin/start-all.sh | 11 ++- sbin/start-history-server.sh| 11 ++- sbin/start-master.sh| 17 + sbin/start-mesos-dispatcher.sh | 11 ++- sbin/start-mesos-shuffle-service.sh | 11 ++- sbin/start-shuffle-service.sh | 11 ++- sbin/start-slave.sh | 18 +- sbin/start-slaves.sh| 19 +-- sbin/start-thriftserver.sh | 11 ++- sbin/stop-all.sh| 14 +++--- sbin/stop-history-server.sh | 7 --- sbin/stop-master.sh | 13 +++-- sbin/stop-mesos-dispatcher.sh | 9 + sbin/stop-mesos-shuffle-service.sh | 7 --- sbin/stop-shuffle-service.sh| 7 --- sbin/stop-slave.sh | 15 --- sbin/stop-slaves.sh | 15 --- sbin/stop-thriftserver.sh | 7 --- 31 files changed, 213 insertions(+), 183 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8aff36e9/bin/beeline -- diff --git a/bin/beeline b/bin/beeline index 3fcb6df..1627626 100755 --- a/bin/beeline +++ b/bin/beeline @@ -23,8 +23,10 @@ # Enter posix mode for bash set -o posix -# Figure out where Spark is installed -FWDIR="$(cd "`dirname "$0"`"/..; pwd)" +# Figure out if SPARK_HOME is set +if [ -z "${SPARK_HOME}" ]; then + export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" +fi CLASS="org.apache.hive.beeline.BeeLine" -exec "$FWDIR/bin/spark-class" $CLASS "$@" +exec "${SPARK_HOME}/bin/spark-class" $CLASS "$@" http://git-wip-us.apache.org/repos/asf/spark/blob/8aff36e9/bin/load-spark-env.sh -- diff --git a/bin/load-spark-env.sh b/bin/load-spark-env.sh index 95779e9..eaea964 100644 --- a/bin/load-spark-env.sh +++ b/bin/load-spark-env.sh @@ -20,13 +20,17 @@ # This script loads spark-env.sh if it exists, and ensures it is only loaded once. 
# spark-env.sh is loaded from SPARK_CONF_DIR if set, or within the current directory's # conf/ subdirectory. -FWDIR="$(cd "`dirname "$0"`"/..; pwd)" + +# Figure out where Spark is installed +if [ -z "${SPARK_HOME}" ]; then + export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" +fi if [ -z "$SPARK_ENV_LOADED" ]; then export SPARK_ENV_LOADED=1 # Returns the parent of the directory this
spark git commit: [SPARK-11442] Reduce numSlices for local metrics test of SparkListenerSuite
Repository: spark Updated Branches: refs/heads/master 8aff36e91 -> c09e51398 [SPARK-11442] Reduce numSlices for local metrics test of SparkListenerSuite In the thread, http://search-hadoop.com/m/q3RTtcQiFSlTxeP/test+failed+due+to+OOME&subj=test+failed+due+to+OOME, it was discussed that memory consumption for SparkListenerSuite should be brought down. This is an attempt in that direction by reducing numSlices for local metrics test. Author: tedyu Closes #9384 from tedyu/master. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c09e5139 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c09e5139 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c09e5139 Branch: refs/heads/master Commit: c09e5139874fb3626e005c8240cca5308b902ef3 Parents: 8aff36e Author: tedyu Authored: Wed Nov 4 10:51:40 2015 + Committer: Sean Owen Committed: Wed Nov 4 10:51:40 2015 + -- .../org/apache/spark/scheduler/SparkListenerSuite.scala | 9 + 1 file changed, 5 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c09e5139/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala index a9652d7..53102b9 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala @@ -212,14 +212,15 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match i } -val d = sc.parallelize(0 to 1e4.toInt, 64).map(w) +val numSlices = 16 +val d = sc.parallelize(0 to 1e3.toInt, numSlices).map(w) d.count() sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) listener.stageInfos.size should be (1) val d2 = d.map { i => w(i) -> i * 2 }.setName("shuffle input 1") val d3 = d.map { i => w(i) -> (0 to (i % 5)) }.setName("shuffle input 2") -val d4 = d2.cogroup(d3, 64).map { case (k, (v1, v2)) => +val d4 = d2.cogroup(d3, numSlices).map { case (k, (v1, v2)) => w(k) -> (v1.size, v2.size) } d4.setName("A Cogroup") @@ -258,8 +259,8 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match if (stageInfo.rddInfos.exists(_.name == d4.name)) { taskMetrics.shuffleReadMetrics should be ('defined) val sm = taskMetrics.shuffleReadMetrics.get - sm.totalBlocksFetched should be (128) - sm.localBlocksFetched should be (128) + sm.totalBlocksFetched should be (2*numSlices) + sm.localBlocksFetched should be (2*numSlices) sm.remoteBlocksFetched should be (0) sm.remoteBytesRead should be (0L) } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-11440][CORE][STREAMING][BUILD] Declare rest of @Experimental items non-experimental if they've existed since 1.2.0
Repository: spark Updated Branches: refs/heads/master 81498dd5c -> 6f81eae24 [SPARK-11440][CORE][STREAMING][BUILD] Declare rest of @Experimental items non-experimental if they've existed since 1.2.0 Remove `Experimental` annotations in core, streaming for items that existed in 1.2.0 or before. The changes are: * SparkContext * binary{Files,Records} : 1.2.0 * submitJob : 1.0.0 * JavaSparkContext * binary{Files,Records} : 1.2.0 * DoubleRDDFunctions, JavaDoubleRDD * {mean,sum}Approx : 1.0.0 * PairRDDFunctions, JavaPairRDD * sampleByKeyExact : 1.2.0 * countByKeyApprox : 1.0.0 * PairRDDFunctions * countApproxDistinctByKey : 1.1.0 * RDD * countApprox, countByValueApprox, countApproxDistinct : 1.0.0 * JavaRDDLike * countApprox : 1.0.0 * PythonHadoopUtil.Converter : 1.1.0 * PortableDataStream : 1.2.0 (related to binaryFiles) * BoundedDouble : 1.0.0 * PartialResult : 1.0.0 * StreamingContext, JavaStreamingContext * binaryRecordsStream : 1.2.0 * HiveContext * analyze : 1.2.0 Author: Sean Owen Closes #9396 from srowen/SPARK-11440. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6f81eae2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6f81eae2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6f81eae2 Branch: refs/heads/master Commit: 6f81eae24f83df51a99d4bb2629dd7daadc01519 Parents: 81498dd Author: Sean Owen Authored: Thu Nov 5 09:08:53 2015 + Committer: Sean Owen Committed: Thu Nov 5 09:08:53 2015 + -- core/src/main/scala/org/apache/spark/SparkContext.scala | 10 +- .../scala/org/apache/spark/api/java/JavaDoubleRDD.scala | 7 --- .../scala/org/apache/spark/api/java/JavaPairRDD.scala | 9 - .../scala/org/apache/spark/api/java/JavaRDDLike.scala | 5 - .../org/apache/spark/api/java/JavaSparkContext.scala | 7 --- .../org/apache/spark/api/python/PythonHadoopUtil.scala| 3 --- .../scala/org/apache/spark/input/PortableDataStream.scala | 2 -- .../scala/org/apache/spark/partial/BoundedDouble.scala| 4 .../scala/org/apache/spark/partial/PartialResult.scala| 3 --- .../scala/org/apache/spark/rdd/DoubleRDDFunctions.scala | 4 .../scala/org/apache/spark/rdd/PairRDDFunctions.scala | 7 --- core/src/main/scala/org/apache/spark/rdd/RDD.scala| 8 +--- .../scala/org/apache/spark/sql/hive/HiveContext.scala | 2 -- .../org/apache/spark/streaming/StreamingContext.scala | 3 --- .../spark/streaming/api/java/JavaStreamingContext.scala | 3 --- 15 files changed, 2 insertions(+), 75 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6f81eae2/core/src/main/scala/org/apache/spark/SparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index a6857b4..7421821 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -45,7 +45,7 @@ import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat => NewFileInputFor import org.apache.mesos.MesosNativeLibrary -import org.apache.spark.annotation.{DeveloperApi, Experimental} +import org.apache.spark.annotation.DeveloperApi import org.apache.spark.broadcast.Broadcast import org.apache.spark.deploy.{LocalSparkCluster, SparkHadoopUtil} import org.apache.spark.executor.{ExecutorEndpoint, TriggerThreadDump} @@ -870,8 +870,6 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli } /** - * :: Experimental :: - * * Get an RDD for a Hadoop-readable dataset as PortableDataStream for each file * (useful for binary 
data) * @@ -902,7 +900,6 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * list of inputs. * @param minPartitions A suggestion value of the minimal splitting number for input data. */ - @Experimental def binaryFiles( path: String, minPartitions: Int = defaultMinPartitions): RDD[(String, PortableDataStream)] = withScope { @@ -922,8 +919,6 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli } /** - * :: Experimental :: - * * Load data from a flat binary file, assuming the length of each record is constant. * * '''Note:''' We ensure that the byte array for each record in the resulting RDD @@ -936,7 +931,6 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * * @return An RDD of data with values, represented as byte arrays */ - @Experimental def binaryRecords( path: String,
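A short usage sketch for two of the APIs whose @Experimental tag is dropped here, `countApprox` and `binaryRecords`; the numbers and the commented-out HDFS path are placeholders, not values from the commit.

```scala
import org.apache.spark.{SparkConf, SparkContext}

object FormerlyExperimentalApis {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("FormerlyExperimentalApis").setMaster("local[*]"))

    val rdd = sc.parallelize(1 to 1000000, 16)

    // countApprox: wait at most 200 ms, accept 90% confidence; returns a PartialResult
    // wrapping a BoundedDouble (a mean plus a low/high interval).
    val approx = rdd.countApprox(200L, 0.90)
    println(approx.initialValue)

    // binaryRecords: fixed-length binary records, e.g. 512-byte blobs.
    // val blobs = sc.binaryRecords("hdfs:///data/fixed-width.bin", 512)

    sc.stop()
  }
}
```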
spark git commit: [SPARK-11378][STREAMING] make StreamingContext.awaitTerminationOrTimeout return properly
Repository: spark Updated Branches: refs/heads/master 6f81eae24 -> 859dff56e [SPARK-11378][STREAMING] make StreamingContext.awaitTerminationOrTimeout return properly This adds a failing test checking that `awaitTerminationOrTimeout` returns the expected value, and then fixes that failing test with the addition of a `return`. tdas zsxwing Author: Nick Evans Closes #9336 from manygrams/fix_await_termination_or_timeout. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/859dff56 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/859dff56 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/859dff56 Branch: refs/heads/master Commit: 859dff56eb0f8c63c86e7e900a12340c199e6247 Parents: 6f81eae Author: Nick Evans Authored: Thu Nov 5 09:18:20 2015 + Committer: Sean Owen Committed: Thu Nov 5 09:18:20 2015 + -- python/pyspark/streaming/context.py | 2 +- python/pyspark/streaming/tests.py | 7 +++ 2 files changed, 8 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/859dff56/python/pyspark/streaming/context.py -- diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py index 975c754..8be56c9 100644 --- a/python/pyspark/streaming/context.py +++ b/python/pyspark/streaming/context.py @@ -218,7 +218,7 @@ class StreamingContext(object): @param timeout: time to wait in seconds """ -self._jssc.awaitTerminationOrTimeout(int(timeout * 1000)) +return self._jssc.awaitTerminationOrTimeout(int(timeout * 1000)) def stop(self, stopSparkContext=True, stopGraceFully=False): """ http://git-wip-us.apache.org/repos/asf/spark/blob/859dff56/python/pyspark/streaming/tests.py -- diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index f7fa481..1794796 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -596,6 +596,13 @@ class StreamingContextTests(PySparkStreamingTestCase): self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc) self.assertTrue(self.setupCalled) +def test_await_termination_or_timeout(self): +self._add_input_stream() +self.ssc.start() +self.assertFalse(self.ssc.awaitTerminationOrTimeout(0.001)) +self.ssc.stop(False) +self.assertTrue(self.ssc.awaitTerminationOrTimeout(0.001)) + class CheckpointTests(unittest.TestCase): - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
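The fix itself is in the PySpark wrapper, which now forwards the boolean the JVM already returned. For context, here is a hedged Scala sketch of the polling pattern that depends on that return value; the queueStream input and the early stop() exist only so the demo has something to run and a way to finish.

```scala
import scala.collection.mutable

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}

object AwaitTerminationSketch {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(
      new SparkConf().setAppName("AwaitTerminationSketch").setMaster("local[2]"),
      Seconds(1))

    // A tiny queue-backed stream so the context has an output operation to run.
    val queue = mutable.Queue[RDD[Int]](ssc.sparkContext.makeRDD(1 to 10))
    ssc.queueStream(queue).count().print()
    ssc.start()

    // Polling loop that relies on the boolean result: false while still running,
    // true once the context has terminated.
    var terminated = false
    while (!terminated) {
      terminated = ssc.awaitTerminationOrTimeout(5000L)
      if (!terminated) {
        // Still running: a real job would do health checks here; the demo just stops
        // gracefully so the next poll returns true and the loop exits.
        ssc.stop(stopSparkContext = true, stopGracefully = true)
      }
    }
  }
}
```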
spark git commit: [SPARK-11378][STREAMING] make StreamingContext.awaitTerminationOrTimeout return properly
Repository: spark Updated Branches: refs/heads/branch-1.5 d31b312fc -> 9522dd23d [SPARK-11378][STREAMING] make StreamingContext.awaitTerminationOrTimeout return properly This adds a failing test checking that `awaitTerminationOrTimeout` returns the expected value, and then fixes that failing test with the addition of a `return`. tdas zsxwing Author: Nick Evans Closes #9336 from manygrams/fix_await_termination_or_timeout. (cherry picked from commit 859dff56eb0f8c63c86e7e900a12340c199e6247) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9522dd23 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9522dd23 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9522dd23 Branch: refs/heads/branch-1.5 Commit: 9522dd23d5b059c76f32b0a288389b03b6c971a8 Parents: d31b312 Author: Nick Evans Authored: Thu Nov 5 09:18:20 2015 + Committer: Sean Owen Committed: Thu Nov 5 09:18:33 2015 + -- python/pyspark/streaming/context.py | 2 +- python/pyspark/streaming/tests.py | 7 +++ 2 files changed, 8 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9522dd23/python/pyspark/streaming/context.py -- diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py index a8c9ffc..3a8f949 100644 --- a/python/pyspark/streaming/context.py +++ b/python/pyspark/streaming/context.py @@ -256,7 +256,7 @@ class StreamingContext(object): @param timeout: time to wait in seconds """ -self._jssc.awaitTerminationOrTimeout(int(timeout * 1000)) +return self._jssc.awaitTerminationOrTimeout(int(timeout * 1000)) def stop(self, stopSparkContext=True, stopGraceFully=False): """ http://git-wip-us.apache.org/repos/asf/spark/blob/9522dd23/python/pyspark/streaming/tests.py -- diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index a8c7b51..824f356 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -585,6 +585,13 @@ class StreamingContextTests(PySparkStreamingTestCase): self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc) self.assertTrue(self.setupCalled) +def test_await_termination_or_timeout(self): +self._add_input_stream() +self.ssc.start() +self.assertFalse(self.ssc.awaitTerminationOrTimeout(0.001)) +self.ssc.stop(False) +self.assertTrue(self.ssc.awaitTerminationOrTimeout(0.001)) + class CheckpointTests(unittest.TestCase): - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-11449][CORE] PortableDataStream should be a factory
Repository: spark Updated Branches: refs/heads/master 859dff56e -> 7bdc92197 [SPARK-11449][CORE] PortableDataStream should be a factory ```PortableDataStream``` maintains some internal state. This makes it tricky to reuse a stream (one needs to call ```close``` on both the ```PortableDataStream``` and the ```InputStream``` it produces). This PR removes all state from ```PortableDataStream``` and effectively turns it into an ```InputStream```/```Array[Byte]``` factory. This makes the user responsible for managing the ```InputStream``` it returns. cc srowen Author: Herman van Hovell Closes #9417 from hvanhovell/SPARK-11449. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7bdc9219 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7bdc9219 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7bdc9219 Branch: refs/heads/master Commit: 7bdc92197cce0edc0110dc9c2158e6e3f42c72ee Parents: 859dff5 Author: Herman van Hovell Authored: Thu Nov 5 09:23:09 2015 + Committer: Sean Owen Committed: Thu Nov 5 09:23:09 2015 + -- .../apache/spark/input/PortableDataStream.scala | 45 +++- 1 file changed, 16 insertions(+), 29 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7bdc9219/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala -- diff --git a/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala b/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala index 33e4ee0..280e7a5 100644 --- a/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala +++ b/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala @@ -21,7 +21,7 @@ import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, Da import scala.collection.JavaConverters._ -import com.google.common.io.ByteStreams +import com.google.common.io.{Closeables, ByteStreams} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext} @@ -82,7 +82,6 @@ private[spark] abstract class StreamBasedRecordReader[T]( if (!processed) { val fileIn = new PortableDataStream(split, context, index) value = parseStream(fileIn) - fileIn.close() // if it has not been open yet, close does nothing key = fileIn.getPath processed = true true @@ -134,12 +133,6 @@ class PortableDataStream( index: Integer) extends Serializable { - // transient forces file to be reopened after being serialization - // it is also used for non-serializable classes - - @transient private var fileIn: DataInputStream = null - @transient private var isOpen = false - private val confBytes = { val baos = new ByteArrayOutputStream() SparkHadoopUtil.get.getConfigurationFromJobContext(context). @@ -175,40 +168,34 @@ class PortableDataStream( } /** - * Create a new DataInputStream from the split and context + * Create a new DataInputStream from the split and context. The user of this method is responsible + * for closing the stream after usage. 
*/ def open(): DataInputStream = { -if (!isOpen) { - val pathp = split.getPath(index) - val fs = pathp.getFileSystem(conf) - fileIn = fs.open(pathp) - isOpen = true -} -fileIn +val pathp = split.getPath(index) +val fs = pathp.getFileSystem(conf) +fs.open(pathp) } /** * Read the file as a byte array */ def toArray(): Array[Byte] = { -open() -val innerBuffer = ByteStreams.toByteArray(fileIn) -close() -innerBuffer +val stream = open() +try { + ByteStreams.toByteArray(stream) +} finally { + Closeables.close(stream, true) +} } /** - * Close the file (if it is currently open) + * Closing the PortableDataStream is not needed anymore. The user either can use the + * PortableDataStream to get a DataInputStream (which the user needs to close after usage), + * or a byte array. */ + @deprecated("Closing the PortableDataStream is not needed anymore.", "1.6.0") def close(): Unit = { -if (isOpen) { - try { -fileIn.close() -isOpen = false - } catch { -case ioe: java.io.IOException => // do nothing - } -} } def getPath(): String = path - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
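A usage sketch of the caller-managed lifecycle this change settles on: open() hands back a fresh DataInputStream each time and closing it is the caller's responsibility. The HDFS path is a placeholder.

```scala
import java.io.DataInputStream

import org.apache.spark.{SparkConf, SparkContext}

object PortableDataStreamUsage {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("PortableDataStreamUsage").setMaster("local[*]"))

    // binaryFiles yields (path, PortableDataStream) pairs; the PortableDataStream is
    // only a factory for streams and byte arrays, it holds no open file handle itself.
    val firstBytes = sc.binaryFiles("hdfs:///data/blobs").map { case (path, pds) =>
      val in: DataInputStream = pds.open() // the caller owns this stream now
      try {
        (path, in.readByte())
      } finally {
        in.close() // close the DataInputStream, not the PortableDataStream
      }
    }

    firstBytes.collect().foreach(println)
    sc.stop()
  }
}
```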
spark git commit: [SPARK-11506][MLLIB] Removed redundant operation in Online LDA implementation
Repository: spark Updated Branches: refs/heads/master 7bdc92197 -> a94671a02 [SPARK-11506][MLLIB] Removed redundant operation in Online LDA implementation In file LDAOptimizer.scala: line 441: since "idx" was never used, replaced unrequired zipWithIndex.foreach with foreach. - nonEmptyDocs.zipWithIndex.foreach { case ((_, termCounts: Vector), idx: Int) => + nonEmptyDocs.foreach { case (_, termCounts: Vector) => Author: a1singh Closes #9456 from a1singh/master. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a94671a0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a94671a0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a94671a0 Branch: refs/heads/master Commit: a94671a027c29bacea37f56b95eccb115638abff Parents: 7bdc921 Author: a1singh Authored: Thu Nov 5 12:51:10 2015 + Committer: Sean Owen Committed: Thu Nov 5 12:51:10 2015 + -- .../scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a94671a0/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala index 38486e9..17c0609 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala @@ -438,7 +438,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer { val stat = BDM.zeros[Double](k, vocabSize) var gammaPart = List[BDV[Double]]() - nonEmptyDocs.zipWithIndex.foreach { case ((_, termCounts: Vector), idx: Int) => + nonEmptyDocs.foreach { case (_, termCounts: Vector) => val ids: List[Int] = termCounts match { case v: DenseVector => (0 until v.size).toList case v: SparseVector => v.indices.toList - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
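The cleanup generalizes to any Scala collection code: if the index that zipWithIndex introduces is never used, iterate directly and skip the per-element tuple. A tiny self-contained illustration (the data is made up):

```scala
object DropUnusedZipWithIndex {
  private def process(counts: Array[Double]): Unit = println(counts.sum)

  def main(args: Array[String]): Unit = {
    val termCountsPerDoc = Seq(Array(1.0, 0.0, 2.0), Array(0.0, 3.0, 1.0))

    // Before: the index is bound but never used, so each element pays for a tuple.
    termCountsPerDoc.zipWithIndex.foreach { case (counts, _) => process(counts) }

    // After: same effect, no index, no tuple allocation.
    termCountsPerDoc.foreach(process)
  }
}
```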
spark git commit: [SPARK-11511][STREAMING] Fix NPE when an InputDStream is not used
Repository: spark Updated Branches: refs/heads/master 253e87e8a -> cf69ce136 [SPARK-11511][STREAMING] Fix NPE when an InputDStream is not used Just ignored `InputDStream`s that have null `rememberDuration` in `DStreamGraph.getMaxInputStreamRememberDuration`. Author: Shixiong Zhu Closes #9476 from zsxwing/SPARK-11511. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cf69ce13 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cf69ce13 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cf69ce13 Branch: refs/heads/master Commit: cf69ce136590fea51843bc54f44f0f45c7d0ac36 Parents: 253e87e Author: Shixiong Zhu Authored: Fri Nov 6 14:51:53 2015 + Committer: Sean Owen Committed: Fri Nov 6 14:51:53 2015 + -- .../org/apache/spark/streaming/DStreamGraph.scala | 3 ++- .../spark/streaming/StreamingContextSuite.scala | 16 2 files changed, 18 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/cf69ce13/streaming/src/main/scala/org/apache/spark/streaming/DStreamGraph.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/DStreamGraph.scala b/streaming/src/main/scala/org/apache/spark/streaming/DStreamGraph.scala index 1b0b789..7829f5e 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/DStreamGraph.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/DStreamGraph.scala @@ -167,7 +167,8 @@ final private[streaming] class DStreamGraph extends Serializable with Logging { * safe remember duration which can be used to perform cleanup operations. */ def getMaxInputStreamRememberDuration(): Duration = { -inputStreams.map { _.rememberDuration }.maxBy { _.milliseconds } +// If an InputDStream is not used, its `rememberDuration` will be null and we can ignore them +inputStreams.map(_.rememberDuration).filter(_ != null).maxBy(_.milliseconds) } @throws(classOf[IOException]) http://git-wip-us.apache.org/repos/asf/spark/blob/cf69ce13/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala -- diff --git a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala index c7a8771..860fac2 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala @@ -780,6 +780,22 @@ class StreamingContextSuite extends SparkFunSuite with BeforeAndAfter with Timeo "Please don't use queueStream when checkpointing is enabled.")) } + test("Creating an InputDStream but not using it should not crash") { +ssc = new StreamingContext(master, appName, batchDuration) +val input1 = addInputStream(ssc) +val input2 = addInputStream(ssc) +val output = new TestOutputStream(input2) +output.register() +val batchCount = new BatchCounter(ssc) +ssc.start() +// Just wait for completing 2 batches to make sure it triggers +// `DStream.getMaxInputStreamRememberDuration` +batchCount.waitUntilBatchesCompleted(2, 1) +// Throw the exception if crash +ssc.awaitTerminationOrTimeout(1) +ssc.stop() + } + def addInputStream(s: StreamingContext): DStream[Int] = { val input = (1 to 100).map(i => 1 to i) val inputStream = new TestInputStream(s, input, 1) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
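The guard in isolation, as a standalone sketch against the public Duration type; the helper name and the sample values are made up.

```scala
import org.apache.spark.streaming.{Duration, Seconds}

object MaxRememberDurationSketch {

  // Unused InputDStreams leave their rememberDuration null; filter those out before
  // maxBy, which would otherwise dereference null and throw a NullPointerException.
  def maxRememberDuration(rememberDurations: Seq[Duration]): Option[Duration] = {
    val defined = rememberDurations.filter(_ != null)
    if (defined.isEmpty) None else Some(defined.maxBy(_.milliseconds))
  }

  def main(args: Array[String]): Unit = {
    println(maxRememberDuration(Seq(Seconds(20), null, Seconds(60)))) // Some(60000 ms)
    println(maxRememberDuration(Seq(null, null)))                     // None
  }
}
```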
spark git commit: [SPARK-11511][STREAMING] Fix NPE when an InputDStream is not used
Repository: spark Updated Branches: refs/heads/branch-1.6 1cfad7d55 -> 0a430f04e [SPARK-11511][STREAMING] Fix NPE when an InputDStream is not used Just ignored `InputDStream`s that have null `rememberDuration` in `DStreamGraph.getMaxInputStreamRememberDuration`. Author: Shixiong Zhu Closes #9476 from zsxwing/SPARK-11511. (cherry picked from commit cf69ce136590fea51843bc54f44f0f45c7d0ac36) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0a430f04 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0a430f04 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0a430f04 Branch: refs/heads/branch-1.6 Commit: 0a430f04eef3445fb0095adc806d91759eea5d32 Parents: 1cfad7d Author: Shixiong Zhu Authored: Fri Nov 6 14:51:53 2015 + Committer: Sean Owen Committed: Fri Nov 6 14:52:08 2015 + -- .../org/apache/spark/streaming/DStreamGraph.scala | 3 ++- .../spark/streaming/StreamingContextSuite.scala | 16 2 files changed, 18 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0a430f04/streaming/src/main/scala/org/apache/spark/streaming/DStreamGraph.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/DStreamGraph.scala b/streaming/src/main/scala/org/apache/spark/streaming/DStreamGraph.scala index 1b0b789..7829f5e 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/DStreamGraph.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/DStreamGraph.scala @@ -167,7 +167,8 @@ final private[streaming] class DStreamGraph extends Serializable with Logging { * safe remember duration which can be used to perform cleanup operations. */ def getMaxInputStreamRememberDuration(): Duration = { -inputStreams.map { _.rememberDuration }.maxBy { _.milliseconds } +// If an InputDStream is not used, its `rememberDuration` will be null and we can ignore them +inputStreams.map(_.rememberDuration).filter(_ != null).maxBy(_.milliseconds) } @throws(classOf[IOException]) http://git-wip-us.apache.org/repos/asf/spark/blob/0a430f04/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala -- diff --git a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala index c7a8771..860fac2 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala @@ -780,6 +780,22 @@ class StreamingContextSuite extends SparkFunSuite with BeforeAndAfter with Timeo "Please don't use queueStream when checkpointing is enabled.")) } + test("Creating an InputDStream but not using it should not crash") { +ssc = new StreamingContext(master, appName, batchDuration) +val input1 = addInputStream(ssc) +val input2 = addInputStream(ssc) +val output = new TestOutputStream(input2) +output.register() +val batchCount = new BatchCounter(ssc) +ssc.start() +// Just wait for completing 2 batches to make sure it triggers +// `DStream.getMaxInputStreamRememberDuration` +batchCount.waitUntilBatchesCompleted(2, 1) +// Throw the exception if crash +ssc.awaitTerminationOrTimeout(1) +ssc.stop() + } + def addInputStream(s: StreamingContext): DStream[Int] = { val input = (1 to 100).map(i => 1 to i) val inputStream = new TestInputStream(s, input, 1) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org