spark git commit: [SPARK-7235] [SQL] Refactor the grouping sets
Repository: spark
Updated Branches: refs/heads/master 4f7fbefb8 -> 7b1450b66

[SPARK-7235] [SQL] Refactor the grouping sets

The logical plan `Expand` takes the `output` as a constructor argument, which breaks the references chain. We need to refactor the code, as well as the column pruning.

Author: Cheng Hao hao.ch...@intel.com

Closes #5780 from chenghao-intel/expand and squashes the following commits:

76e4aa4 [Cheng Hao] revert the change for case insensitive
7c10a83 [Cheng Hao] refactor the grouping sets

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7b1450b6
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7b1450b6
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7b1450b6

Branch: refs/heads/master
Commit: 7b1450b666f88452e7fe969a6d59e8b24842ea39
Parents: 4f7fbef
Author: Cheng Hao hao.ch...@intel.com
Authored: Tue Jun 23 10:52:17 2015 -0700
Committer: Michael Armbrust mich...@databricks.com
Committed: Tue Jun 23 10:52:17 2015 -0700

--
 .../spark/sql/catalyst/analysis/Analyzer.scala  | 55 +++--
 .../catalyst/expressions/namedExpressions.scala |  2 +-
 .../sql/catalyst/optimizer/Optimizer.scala      |  4 +
 .../catalyst/plans/logical/basicOperators.scala | 84 +++-
 .../spark/sql/execution/SparkStrategies.scala   |  4 +-
 5 files changed, 78 insertions(+), 71 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/7b1450b6/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
--
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index 6311784..0a3f5a7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -192,49 +192,17 @@ class Analyzer(
       Seq.tabulate(1 << c.groupByExprs.length)(i => i)
     }

-    /**
-     * Create an array of Projections for the child projection, and replace the projections'
-     * expressions which equal GroupBy expressions with Literal(null), if those expressions
-     * are not set for this grouping set (according to the bit mask).
-     */
-    private[this] def expand(g: GroupingSets): Seq[Seq[Expression]] = {
-      val result = new scala.collection.mutable.ArrayBuffer[Seq[Expression]]
-
-      g.bitmasks.foreach { bitmask =>
-        // get the non selected grouping attributes according to the bit mask
-        val nonSelectedGroupExprs = ArrayBuffer.empty[Expression]
-        var bit = g.groupByExprs.length - 1
-        while (bit >= 0) {
-          if (((bitmask >> bit) & 1) == 0) nonSelectedGroupExprs += g.groupByExprs(bit)
-          bit -= 1
-        }
-
-        val substitution = (g.child.output :+ g.gid).map(expr => expr transformDown {
-          case x: Expression if nonSelectedGroupExprs.find(_ semanticEquals x).isDefined =>
-            // if the input attribute is in the invalid grouping expression set for this group,
-            // replace it with constant null
-            Literal.create(null, expr.dataType)
-          case x if x == g.gid =>
-            // replace the groupingId with a concrete value (the bit mask)
-            Literal.create(bitmask, IntegerType)
-        })
-
-        result += substitution
-      }
-
-      result.toSeq
-    }
-
     def apply(plan: LogicalPlan): LogicalPlan = plan transform {
-      case a: Cube if a.resolved =>
-        GroupingSets(bitmasks(a), a.groupByExprs, a.child, a.aggregations, a.gid)
-      case a: Rollup if a.resolved =>
-        GroupingSets(bitmasks(a), a.groupByExprs, a.child, a.aggregations, a.gid)
-      case x: GroupingSets if x.resolved =>
+      case a: Cube =>
+        GroupingSets(bitmasks(a), a.groupByExprs, a.child, a.aggregations)
+      case a: Rollup =>
+        GroupingSets(bitmasks(a), a.groupByExprs, a.child, a.aggregations)
+      case x: GroupingSets =>
+        val gid = AttributeReference(VirtualColumn.groupingIdName, IntegerType, false)()
         Aggregate(
-          x.groupByExprs :+ x.gid,
+          x.groupByExprs :+ VirtualColumn.groupingIdAttribute,
           x.aggregations,
-          Expand(expand(x), x.child.output :+ x.gid, x.child))
+          Expand(x.bitmasks, x.groupByExprs, gid, x.child))
     }
   }

@@ -368,12 +336,7 @@ class Analyzer(
       case q: LogicalPlan =>
         logTrace(s"Attempting to resolve ${q.simpleString}")
-        q transformExpressionsUp {
-          case u @ UnresolvedAttribute(nameParts) if nameParts.length == 1 &&
-            resolver(nameParts(0), VirtualColumn.groupingIdName) &&
-            q.isInstanceOf[GroupingAnalytics] =>
-
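The heart of both the old and new code paths is the bitmask trick: a CUBE or ROLLUP over n columns is compiled into one projection per bitmask, with unselected group-by columns nulled out and the bitmask itself emitted as the grouping id. A standalone toy sketch of that idea (illustrative names only, not Catalyst's classes, and assuming the convention that bit i of a bitmask selects column i; Spark's real bit ordering may differ):

// Toy model of grouping-set expansion; not Catalyst's actual classes.
object GroupingSetsSketch {
  def expand(groupByValues: Seq[Any], bitmasks: Seq[Int]): Seq[Seq[Any]] =
    bitmasks.map { bitmask =>
      val masked = groupByValues.zipWithIndex.map { case (v, i) =>
        // Assumption: bit i selects column i; unselected columns become null.
        if (((bitmask >> i) & 1) == 1) v else null
      }
      masked :+ bitmask // the bitmask doubles as the grouping id column
    }

  def main(args: Array[String]): Unit = {
    // A CUBE over two columns enumerates all 1 << 2 = 4 bitmasks, as in bitmasks(c):
    val bitmasks = Seq.tabulate(1 << 2)(i => i)
    expand(Seq("US", "2015"), bitmasks).foreach(println)
  }
}

Running this prints the four expanded rows a two-column CUBE would feed into the downstream Aggregate.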
spark git commit: [SPARK-8498] [TUNGSTEN] fix npe in errorhandling path in unsafeshuffle writer
Repository: spark
Updated Branches: refs/heads/branch-1.4 929479675 -> 334824505

[SPARK-8498] [TUNGSTEN] fix npe in errorhandling path in unsafeshuffle writer

Author: Holden Karau hol...@pigscanfly.ca

Closes #6918 from holdenk/SPARK-8498-fix-npe-in-errorhandling-path-in-unsafeshuffle-writer and squashes the following commits:

f807832 [Holden Karau] Log error if we can't throw it
855f9aa [Holden Karau] Spelling - not my strongest suite. Fix Propegates to Propagates.
039d620 [Holden Karau] Add missing closeandwriteoutput
30e558d [Holden Karau] go back to try/finally
e503b8c [Holden Karau] Improve the test to ensure we aren't masking the underlying exception
ae0b7a7 [Holden Karau] Fix the test
2e6abf7 [Holden Karau] Be more cautious when cleaning up during failed write and re-throw user exceptions

(cherry picked from commit 0f92be5b5f017b593bd29d4da7e89aad2b3adac2)
Signed-off-by: Josh Rosen joshro...@databricks.com

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/33482450
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/33482450
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/33482450

Branch: refs/heads/branch-1.4
Commit: 33482450559028b907a0473277297cc54429322e
Parents: 9294796
Author: Holden Karau hol...@pigscanfly.ca
Authored: Tue Jun 23 09:08:11 2015 -0700
Committer: Josh Rosen joshro...@databricks.com
Committed: Tue Jun 23 09:08:49 2015 -0700

--
 .../spark/shuffle/unsafe/UnsafeShuffleWriter.java  | 18 --
 .../shuffle/unsafe/UnsafeShuffleWriterSuite.java   | 17 +
 2 files changed, 33 insertions(+), 2 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/33482450/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java
--
diff --git a/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java b/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java
index ad7eb04..764578b 100644
--- a/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java
+++ b/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java
@@ -139,6 +139,9 @@ public class UnsafeShuffleWriter<K, V> extends ShuffleWriter<K, V> {

   @Override
   public void write(scala.collection.Iterator<Product2<K, V>> records) throws IOException {
+    // Keep track of success so we know if we encountered an exception
+    // We do this rather than a standard try/catch/re-throw to handle
+    // generic throwables.
     boolean success = false;
     try {
       while (records.hasNext()) {
@@ -147,8 +150,19 @@ public class UnsafeShuffleWriter<K, V> extends ShuffleWriter<K, V> {
       closeAndWriteOutput();
       success = true;
     } finally {
-      if (!success) {
-        sorter.cleanupAfterError();
+      if (sorter != null) {
+        try {
+          sorter.cleanupAfterError();
+        } catch (Exception e) {
+          // Only throw this error if we won't be masking another
+          // error.
+          if (success) {
+            throw e;
+          } else {
+            logger.error("In addition to a failure during writing, we failed during " +
+              "cleanup.", e);
+          }
+        }
+      }
     }
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/33482450/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
--
diff --git a/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java b/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
index 83d1091..10c3eed 100644
--- a/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
+++ b/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
@@ -253,6 +253,23 @@ public class UnsafeShuffleWriterSuite {
     createWriter(false).stop(false);
   }

+  class PandaException extends RuntimeException {
+  }
+
+  @Test(expected=PandaException.class)
+  public void writeFailurePropagates() throws Exception {
+    class BadRecords extends scala.collection.AbstractIterator<Product2<Object, Object>> {
+      @Override public boolean hasNext() {
+        throw new PandaException();
+      }
+      @Override public Product2<Object, Object> next() {
+        return null;
+      }
+    }
+    final UnsafeShuffleWriter<Object, Object> writer = createWriter(true);
+    writer.write(new BadRecords());
+  }
+
   @Test
   public void writeEmptyIterator() throws Exception {
     final UnsafeShuffleWriter<Object, Object> writer = createWriter(true);
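The pattern this patch introduces is general enough to state on its own: run cleanup in a finally block, but never let a cleanup exception mask the primary failure. A minimal Scala rendering of the same logic (Sorter, Record, and Writer are illustrative stand-ins, not Spark's classes):

// Illustrative stand-ins; not Spark's classes.
trait Record
trait Sorter {
  def insert(r: Record): Unit
  def cleanupAfterError(): Unit
}

class Writer(sorter: Sorter) {
  def write(records: Iterator[Record]): Unit = {
    var success = false
    try {
      records.foreach(sorter.insert)
      success = true
    } finally {
      if (sorter != null) { // guard against the NPE this patch fixes
        try {
          sorter.cleanupAfterError()
        } catch {
          case e: Exception =>
            // Only rethrow when there is no primary failure to mask.
            if (success) throw e
            else System.err.println(s"In addition to a failure during writing, cleanup failed: $e")
        }
      }
    }
  }
}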
spark git commit: Revert "[SPARK-8498] [TUNGSTEN] fix npe in errorhandling path in unsafeshuffle writer"
Repository: spark
Updated Branches: refs/heads/branch-1.4 334824505 -> 77cb1d5ed

Revert "[SPARK-8498] [TUNGSTEN] fix npe in errorhandling path in unsafeshuffle writer"

This reverts commit 33482450559028b907a0473277297cc54429322e.

Reverting because `catch (Exception e) ... throw e` doesn't compile under Java 6 unless the method declares that it throws Exception. (Java 7's precise-rethrow analysis accepts the pattern, but Spark still supported Java 6 at this point.)

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/77cb1d5e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/77cb1d5e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/77cb1d5e

Branch: refs/heads/branch-1.4
Commit: 77cb1d5ed1d0343b512856e24d9c14556236364b
Parents: 3348245
Author: Josh Rosen joshro...@databricks.com
Authored: Tue Jun 23 09:19:11 2015 -0700
Committer: Josh Rosen joshro...@databricks.com
Committed: Tue Jun 23 09:19:11 2015 -0700

--
 .../spark/shuffle/unsafe/UnsafeShuffleWriter.java  | 18 ++
 .../shuffle/unsafe/UnsafeShuffleWriterSuite.java   | 17 -
 2 files changed, 2 insertions(+), 33 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/77cb1d5e/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java
--
diff --git a/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java b/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java
index 764578b..ad7eb04 100644
--- a/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java
+++ b/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java
@@ -139,9 +139,6 @@ public class UnsafeShuffleWriter<K, V> extends ShuffleWriter<K, V> {

   @Override
   public void write(scala.collection.Iterator<Product2<K, V>> records) throws IOException {
-    // Keep track of success so we know if we encountered an exception
-    // We do this rather than a standard try/catch/re-throw to handle
-    // generic throwables.
     boolean success = false;
     try {
       while (records.hasNext()) {
@@ -150,19 +147,8 @@ public class UnsafeShuffleWriter<K, V> extends ShuffleWriter<K, V> {
       closeAndWriteOutput();
       success = true;
     } finally {
-      if (sorter != null) {
-        try {
-          sorter.cleanupAfterError();
-        } catch (Exception e) {
-          // Only throw this error if we won't be masking another
-          // error.
-          if (success) {
-            throw e;
-          } else {
-            logger.error("In addition to a failure during writing, we failed during " +
-              "cleanup.", e);
-          }
-        }
+      if (!success) {
+        sorter.cleanupAfterError();
       }
     }
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/77cb1d5e/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
--
diff --git a/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java b/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
index 10c3eed..83d1091 100644
--- a/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
+++ b/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
@@ -253,23 +253,6 @@ public class UnsafeShuffleWriterSuite {
     createWriter(false).stop(false);
   }

-  class PandaException extends RuntimeException {
-  }
-
-  @Test(expected=PandaException.class)
-  public void writeFailurePropagates() throws Exception {
-    class BadRecords extends scala.collection.AbstractIterator<Product2<Object, Object>> {
-      @Override public boolean hasNext() {
-        throw new PandaException();
-      }
-      @Override public Product2<Object, Object> next() {
-        return null;
-      }
-    }
-    final UnsafeShuffleWriter<Object, Object> writer = createWriter(true);
-    writer.write(new BadRecords());
-  }
-
   @Test
   public void writeEmptyIterator() throws Exception {
     final UnsafeShuffleWriter<Object, Object> writer = createWriter(true);
spark git commit: [SQL] [DOCS] updated the documentation for explode
Repository: spark
Updated Branches: refs/heads/branch-1.4 77cb1d5ed -> 27693e175

[SQL] [DOCS] updated the documentation for explode

The syntax was incorrect in the example for `explode`.

Author: lockwobr lockw...@gmail.com

Closes #6943 from lockwobr/master and squashes the following commits:

3d864d1 [lockwobr] updated the documentation for explode

(cherry picked from commit 4f7fbefb8db56ecaab66bb0ac2ab124416fefe58)
Signed-off-by: Kousuke Saruta saru...@oss.nttdata.co.jp

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/27693e17
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/27693e17
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/27693e17

Branch: refs/heads/branch-1.4
Commit: 27693e175715a7d6901ec79ab510b883a4cb0fb2
Parents: 77cb1d5
Author: lockwobr lockw...@gmail.com
Authored: Wed Jun 24 02:48:56 2015 +0900
Committer: Kousuke Saruta saru...@oss.nttdata.co.jp
Committed: Wed Jun 24 02:51:36 2015 +0900

--
 sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/27693e17/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index 2a01824..29bba18 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -999,7 +999,7 @@ class DataFrame private[sql](
    * columns of the input row are implicitly joined with each value that is output by the function.
    *
    * {{{
-   *   df.explode("words", "word")(words: String => words.split(" "))
+   *   df.explode("words", "word"){words: String => words.split(" ")}
    * }}}
    * @group dfops
    * @since 1.3.0
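The reason the parenthesized variant is wrong is a Scala parsing rule, not a DataFrame quirk: a function literal whose parameter carries a type annotation without its own parentheses is only legal inside braces. A small spark-shell sketch against the 1.4-era API (the local-mode setup and sample data are illustrative):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("explode-demo"))
val sqlContext = new SQLContext(sc)
import sqlContext.implicits._

val df = sc.parallelize(Seq(Tuple1("a b c"))).toDF("words")

// Braces are required: a typed, unparenthesized lambda parameter must live in a block.
df.explode("words", "word") { words: String => words.split(" ") }.show()
// With parentheses, the parameter itself must be parenthesized instead:
df.explode("words", "word")((words: String) => words.split(" ")).show()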
spark git commit: [SPARK-8498] [TUNGSTEN] fix npe in errorhandling path in unsafeshuffle writer
Repository: spark
Updated Branches: refs/heads/master 6ceb16960 -> 0f92be5b5

[SPARK-8498] [TUNGSTEN] fix npe in errorhandling path in unsafeshuffle writer

Author: Holden Karau hol...@pigscanfly.ca

Closes #6918 from holdenk/SPARK-8498-fix-npe-in-errorhandling-path-in-unsafeshuffle-writer and squashes the following commits:

f807832 [Holden Karau] Log error if we can't throw it
855f9aa [Holden Karau] Spelling - not my strongest suite. Fix Propegates to Propagates.
039d620 [Holden Karau] Add missing closeandwriteoutput
30e558d [Holden Karau] go back to try/finally
e503b8c [Holden Karau] Improve the test to ensure we aren't masking the underlying exception
ae0b7a7 [Holden Karau] Fix the test
2e6abf7 [Holden Karau] Be more cautious when cleaning up during failed write and re-throw user exceptions

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0f92be5b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0f92be5b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0f92be5b

Branch: refs/heads/master
Commit: 0f92be5b5f017b593bd29d4da7e89aad2b3adac2
Parents: 6ceb169
Author: Holden Karau hol...@pigscanfly.ca
Authored: Tue Jun 23 09:08:11 2015 -0700
Committer: Josh Rosen joshro...@databricks.com
Committed: Tue Jun 23 09:08:11 2015 -0700

--
 .../spark/shuffle/unsafe/UnsafeShuffleWriter.java  | 18 --
 .../shuffle/unsafe/UnsafeShuffleWriterSuite.java   | 17 +
 2 files changed, 33 insertions(+), 2 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/0f92be5b/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java
--
diff --git a/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java b/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java
index ad7eb04..764578b 100644
--- a/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java
+++ b/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java
@@ -139,6 +139,9 @@ public class UnsafeShuffleWriter<K, V> extends ShuffleWriter<K, V> {

   @Override
   public void write(scala.collection.Iterator<Product2<K, V>> records) throws IOException {
+    // Keep track of success so we know if we encountered an exception
+    // We do this rather than a standard try/catch/re-throw to handle
+    // generic throwables.
     boolean success = false;
     try {
       while (records.hasNext()) {
@@ -147,8 +150,19 @@ public class UnsafeShuffleWriter<K, V> extends ShuffleWriter<K, V> {
       closeAndWriteOutput();
       success = true;
     } finally {
-      if (!success) {
-        sorter.cleanupAfterError();
+      if (sorter != null) {
+        try {
+          sorter.cleanupAfterError();
+        } catch (Exception e) {
+          // Only throw this error if we won't be masking another
+          // error.
+          if (success) {
+            throw e;
+          } else {
+            logger.error("In addition to a failure during writing, we failed during " +
+              "cleanup.", e);
+          }
+        }
+      }
     }
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/0f92be5b/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
--
diff --git a/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java b/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
index 83d1091..10c3eed 100644
--- a/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
+++ b/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
@@ -253,6 +253,23 @@ public class UnsafeShuffleWriterSuite {
     createWriter(false).stop(false);
   }

+  class PandaException extends RuntimeException {
+  }
+
+  @Test(expected=PandaException.class)
+  public void writeFailurePropagates() throws Exception {
+    class BadRecords extends scala.collection.AbstractIterator<Product2<Object, Object>> {
+      @Override public boolean hasNext() {
+        throw new PandaException();
+      }
+      @Override public Product2<Object, Object> next() {
+        return null;
+      }
+    }
+    final UnsafeShuffleWriter<Object, Object> writer = createWriter(true);
+    writer.write(new BadRecords());
+  }
+
   @Test
   public void writeEmptyIterator() throws Exception {
     final UnsafeShuffleWriter<Object, Object> writer = createWriter(true);
spark git commit: [SQL] [DOCS] updated the documentation for explode
Repository: spark
Updated Branches: refs/heads/master 0f92be5b5 -> 4f7fbefb8

[SQL] [DOCS] updated the documentation for explode

The syntax was incorrect in the example for `explode`.

Author: lockwobr lockw...@gmail.com

Closes #6943 from lockwobr/master and squashes the following commits:

3d864d1 [lockwobr] updated the documentation for explode

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4f7fbefb
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4f7fbefb
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4f7fbefb

Branch: refs/heads/master
Commit: 4f7fbefb8db56ecaab66bb0ac2ab124416fefe58
Parents: 0f92be5
Author: lockwobr lockw...@gmail.com
Authored: Wed Jun 24 02:48:56 2015 +0900
Committer: Kousuke Saruta saru...@oss.nttdata.co.jp
Committed: Wed Jun 24 02:48:56 2015 +0900

--
 sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/4f7fbefb/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index 492a332..f3f0f53 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -1049,7 +1049,7 @@ class DataFrame private[sql](
    * columns of the input row are implicitly joined with each value that is output by the function.
    *
    * {{{
-   *   df.explode("words", "word")(words: String => words.split(" "))
+   *   df.explode("words", "word"){words: String => words.split(" ")}
    * }}}
    * @group dfops
    * @since 1.3.0
spark git commit: [SPARK-8265] [MLLIB] [PYSPARK] Add LinearDataGenerator to pyspark.mllib.utils
Repository: spark Updated Branches: refs/heads/master 2bdd0 - f2022fa0d [SPARK-8265] [MLLIB] [PYSPARK] Add LinearDataGenerator to pyspark.mllib.utils It is useful to generate linear data for easy testing of linear models and in general. Scala already has it. This is just a wrapper around the Scala code. Author: MechCoder manojkumarsivaraj...@gmail.com Closes #6715 from MechCoder/generate_linear_input and squashes the following commits: 6182884 [MechCoder] Minor changes 8bda047 [MechCoder] Minor style fixes 0f1053c [MechCoder] [SPARK-8265] Add LinearDataGenerator to pyspark.mllib.utils Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f2022fa0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f2022fa0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f2022fa0 Branch: refs/heads/master Commit: f2022fa0d375c804eca7803e172543b23ecbb9b7 Parents: 2bd Author: MechCoder manojkumarsivaraj...@gmail.com Authored: Tue Jun 23 12:43:32 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Tue Jun 23 12:43:32 2015 -0700 -- .../spark/mllib/api/python/PythonMLLibAPI.scala | 32 +- python/pyspark/mllib/tests.py | 22 ++-- python/pyspark/mllib/util.py| 35 3 files changed, 86 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f2022fa0/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index f9a271f..c4bea7c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -51,6 +51,7 @@ import org.apache.spark.mllib.tree.loss.Losses import org.apache.spark.mllib.tree.model.{DecisionTreeModel, GradientBoostedTreesModel, RandomForestModel} import org.apache.spark.mllib.tree.{DecisionTree, GradientBoostedTrees, RandomForest} import org.apache.spark.mllib.util.MLUtils +import org.apache.spark.mllib.util.LinearDataGenerator import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame import org.apache.spark.storage.StorageLevel @@ -972,7 +973,7 @@ private[python] class PythonMLLibAPI extends Serializable { def estimateKernelDensity( sample: JavaRDD[Double], bandwidth: Double, points: java.util.ArrayList[Double]): Array[Double] = { -return new KernelDensity().setSample(sample).setBandwidth(bandwidth).estimate( +new KernelDensity().setSample(sample).setBandwidth(bandwidth).estimate( points.asScala.toArray) } @@ -991,6 +992,35 @@ private[python] class PythonMLLibAPI extends Serializable { List[AnyRef](model.clusterCenters, Vectors.dense(model.clusterWeights)).asJava } + /** + * Wrapper around the generateLinearInput method of LinearDataGenerator. + */ + def generateLinearInputWrapper( + intercept: Double, + weights: JList[Double], + xMean: JList[Double], + xVariance: JList[Double], + nPoints: Int, + seed: Int, + eps: Double): Array[LabeledPoint] = { +LinearDataGenerator.generateLinearInput( + intercept, weights.asScala.toArray, xMean.asScala.toArray, + xVariance.asScala.toArray, nPoints, seed, eps).toArray + } + + /** + * Wrapper around the generateLinearRDD method of LinearDataGenerator. 
+ */ + def generateLinearRDDWrapper( + sc: JavaSparkContext, + nexamples: Int, + nfeatures: Int, + eps: Double, + nparts: Int, + intercept: Double): JavaRDD[LabeledPoint] = { +LinearDataGenerator.generateLinearRDD( + sc, nexamples, nfeatures, eps, nparts, intercept) + } } /** http://git-wip-us.apache.org/repos/asf/spark/blob/f2022fa0/python/pyspark/mllib/tests.py -- diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index c8d61b9..509faa1 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -49,8 +49,8 @@ from pyspark.mllib.random import RandomRDDs from pyspark.mllib.stat import Statistics from pyspark.mllib.feature import Word2Vec from pyspark.mllib.feature import IDF -from pyspark.mllib.feature import StandardScaler -from pyspark.mllib.feature import ElementwiseProduct +from pyspark.mllib.feature import StandardScaler, ElementwiseProduct +from pyspark.mllib.util import LinearDataGenerator from pyspark.serializers import PickleSerializer from pyspark.streaming import StreamingContext from pyspark.sql import SQLContext
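For readers coming from the Scala side, the two methods wrapped above can be exercised directly; a minimal sketch (local-mode setup and argument values are illustrative, positional signatures taken from the wrappers in this diff):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.util.LinearDataGenerator

val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("lindata"))

// Local data: points scattered around y = 0.5 + 1.0*x1 - 2.0*x2 + noise(eps)
val points = LinearDataGenerator.generateLinearInput(
  0.5, Array(1.0, -2.0), Array(0.0, 0.0), Array(1.0, 1.0), 100, 42, 0.1)

// Distributed data: 1000 examples with 3 features over 2 partitions, intercept 0.5
val rdd = LinearDataGenerator.generateLinearRDD(sc, 1000, 3, 0.1, 2, 0.5)

println(points.head)
println(rdd.count())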
spark git commit: [SPARK-7888] Be able to disable intercept in linear regression in ml package
Repository: spark
Updated Branches: refs/heads/master 6f4cadf5e -> 2bdd0

[SPARK-7888] Be able to disable intercept in linear regression in ml package

Author: Holden Karau hol...@pigscanfly.ca

Closes #6927 from holdenk/SPARK-7888-Be-able-to-disable-intercept-in-Linear-Regression-in-ML-package and squashes the following commits:

0ad384c [Holden Karau] Add MiMa excludes
4016fac [Holden Karau] Switch to wild card import, remove extra blank lines
ae5baa8 [Holden Karau] CR feedback, move the fitIntercept down rather than changing ymean and etc above
f34971c [Holden Karau] Fix some more long lines
319bd3f [Holden Karau] Fix long lines
3bb9ee1 [Holden Karau] Update the regression suite tests
7015b9f [Holden Karau] Our code performs the same with R, except we need more than one data point but that seems reasonable
0b0c8c0 [Holden Karau] fix the issue with the sample R code
e2140ba [Holden Karau] Add a test, it fails!
5e84a0b [Holden Karau] Write out thoughts and use the correct trait
91ffc0a [Holden Karau] more murh
006246c [Holden Karau] murp?

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2bdd
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2bdd
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2bdd

Branch: refs/heads/master
Commit: 2bdd0b8deb9ad8d43fec792e60e3d0c4de75
Parents: 6f4cadf
Author: Holden Karau hol...@pigscanfly.ca
Authored: Tue Jun 23 12:42:17 2015 -0700
Committer: DB Tsai d...@netflix.com
Committed: Tue Jun 23 12:42:17 2015 -0700

--
 .../spark/ml/regression/LinearRegression.scala  |  30 +++-
 .../ml/regression/LinearRegressionSuite.scala   | 149 ++-
 project/MimaExcludes.scala                      |   5 +
 3 files changed, 172 insertions(+), 12 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/2bdd/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 0130654..1b1d729 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -26,7 +26,7 @@ import org.apache.spark.Logging
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.PredictorParams
 import org.apache.spark.ml.param.ParamMap
-import org.apache.spark.ml.param.shared.{HasElasticNetParam, HasMaxIter, HasRegParam, HasTol}
+import org.apache.spark.ml.param.shared._
 import org.apache.spark.ml.util.Identifiable
 import org.apache.spark.mllib.linalg.{Vector, Vectors}
 import org.apache.spark.mllib.linalg.BLAS._
@@ -41,7 +41,8 @@ import org.apache.spark.util.StatCounter
  * Params for linear regression.
  */
 private[regression] trait LinearRegressionParams extends PredictorParams
-  with HasRegParam with HasElasticNetParam with HasMaxIter with HasTol
+    with HasRegParam with HasElasticNetParam with HasMaxIter with HasTol
+    with HasFitIntercept

 /**
  * :: Experimental ::
@@ -73,6 +74,14 @@ class LinearRegression(override val uid: String)
   setDefault(regParam -> 0.0)

   /**
+   * Set if we should fit the intercept
+   * Default is true.
+   * @group setParam
+   */
+  def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value)
+  setDefault(fitIntercept -> true)
+
+  /**
    * Set the ElasticNet mixing parameter.
    * For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.
    * For 0 < alpha < 1, the penalty is a combination of L1 and L2.
@@ -123,6 +132,7 @@ class LinearRegression(override val uid: String)
     val numFeatures = summarizer.mean.size
     val yMean = statCounter.mean
     val yStd = math.sqrt(statCounter.variance)
+    // look at glmnet5.m L761 maaaybe that has info

     // If the yStd is zero, then the intercept is yMean with zero weights;
     // as a result, training is not needed.
@@ -142,7 +152,7 @@ class LinearRegression(override val uid: String)
     val effectiveL1RegParam = $(elasticNetParam) * effectiveRegParam
     val effectiveL2RegParam = (1.0 - $(elasticNetParam)) * effectiveRegParam

-    val costFun = new LeastSquaresCostFun(instances, yStd, yMean,
+    val costFun = new LeastSquaresCostFun(instances, yStd, yMean, $(fitIntercept),
       featuresStd, featuresMean, effectiveL2RegParam)

     val optimizer = if ($(elasticNetParam) == 0.0 || effectiveRegParam == 0.0) {
@@ -180,7 +190,7 @@ class LinearRegression(override val uid: String)
     // The intercept in R's GLMNET is computed using closed form after the coefficients are
     // converged. See the following discussion for
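Downstream, the new parameter is set like any other Param; a minimal usage sketch (the `training` DataFrame of (label, features) rows is assumed, not defined here):

import org.apache.spark.ml.regression.LinearRegression

// Forcing the fit through the origin, analogous to R's lm(y ~ . - 1):
val lr = new LinearRegression()
  .setMaxIter(100)
  .setRegParam(0.0)
  .setFitIntercept(false)

// val model = lr.fit(training) // `training` is an assumed DataFrame
// model.intercept would then be exactly 0.0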
spark git commit: [SPARK-8111] [SPARKR] SparkR shell should display Spark logo and version banner on startup.
Repository: spark
Updated Branches: refs/heads/master f2022fa0d -> f2fb0285a

[SPARK-8111] [SPARKR] SparkR shell should display Spark logo and version banner on startup.

The Spark version is taken from the environment variable SPARK_VERSION.

Author: Alok Singh singhal@Aloks-MacBook-Pro.local
Author: Alok Singh sing...@aloks-mbp.usca.ibm.com

Closes #6944 from aloknsingh/aloknsingh_spark_jiras and squashes the following commits:

ed607bd [Alok Singh] [SPARK-8111][SparkR] As per suggestion, 1) using the version from sparkContext rather than the Sys.env. 2) change "Welcome to SparkR!" to "Welcome to" followed by Spark logo and version
acd5b85 [Alok Singh] fix the jira SPARK-8111 to add the spark version and logo. Currently spark version is taken from the environment variable SPARK_VERSION

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f2fb0285
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f2fb0285
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f2fb0285

Branch: refs/heads/master
Commit: f2fb0285ab6d4225c5350f109dea6c1c017bb491
Parents: f2022fa
Author: Alok Singh singhal@Aloks-MacBook-Pro.local
Authored: Tue Jun 23 12:47:55 2015 -0700
Committer: Shivaram Venkataraman shiva...@cs.berkeley.edu
Committed: Tue Jun 23 12:47:55 2015 -0700

--
 R/pkg/inst/profile/shell.R | 16 +++-
 1 file changed, 15 insertions(+), 1 deletion(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/f2fb0285/R/pkg/inst/profile/shell.R
--
diff --git a/R/pkg/inst/profile/shell.R b/R/pkg/inst/profile/shell.R
index 773b6ec..7189f1a 100644
--- a/R/pkg/inst/profile/shell.R
+++ b/R/pkg/inst/profile/shell.R
@@ -27,7 +27,21 @@
   sc <- SparkR::sparkR.init()
   assign("sc", sc, envir=.GlobalEnv)
   sqlContext <- SparkR::sparkRSQL.init(sc)
+  sparkVer <- SparkR:::callJMethod(sc, "version")
   assign("sqlContext", sqlContext, envir=.GlobalEnv)
-  cat("\n Welcome to SparkR!")
+  cat("\n Welcome to")
+  cat("\n")
+  cat("    ____              __", "\n")
+  cat("   / __/__  ___ _____/ /__", "\n")
+  cat("  _\\ \\/ _ \\/ _ `/ __/  '_/", "\n")
+  cat(" /___/ .__/\\_,_/_/ /_/\\_\\")
+  if (nchar(sparkVer) == 0) {
+    cat("\n")
+  } else {
+    cat("   version ", sparkVer, "\n")
+  }
+  cat("/_/", "\n")
+  cat("\n")
+  cat("\n Spark context is available as sc, SQL context is available as sqlContext\n")
 }
spark git commit: [SPARK-8432] [SQL] fix hashCode() and equals() of BinaryType in Row
Repository: spark
Updated Branches: refs/heads/master 7b1450b66 -> 6f4cadf5e

[SPARK-8432] [SQL] fix hashCode() and equals() of BinaryType in Row

Also added more tests in LiteralExpressionSuite

Author: Davies Liu dav...@databricks.com

Closes #6876 from davies/fix_hashcode and squashes the following commits:

429c2c0 [Davies Liu] Merge branch 'master' of github.com:apache/spark into fix_hashcode
32d9811 [Davies Liu] fix test
a0626ed [Davies Liu] Merge branch 'master' of github.com:apache/spark into fix_hashcode
89c2432 [Davies Liu] fix style
bd20780 [Davies Liu] check with catalyst types
41caec6 [Davies Liu] change for to while
d96929b [Davies Liu] address comment
6ad2a90 [Davies Liu] fix style
5819d33 [Davies Liu] unify equals() and hashCode()
0fff25d [Davies Liu] fix style
53c38b1 [Davies Liu] fix hashCode() and equals() of BinaryType in Row

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6f4cadf5
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6f4cadf5
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6f4cadf5

Branch: refs/heads/master
Commit: 6f4cadf5ee81467d077febc53d36571dd232295d
Parents: 7b1450b
Author: Davies Liu dav...@databricks.com
Authored: Tue Jun 23 11:55:47 2015 -0700
Committer: Michael Armbrust mich...@databricks.com
Committed: Tue Jun 23 11:55:47 2015 -0700

--
 .../main/java/org/apache/spark/sql/BaseRow.java |  21 --
 .../main/scala/org/apache/spark/sql/Row.scala   |  32 --
 .../apache/spark/sql/catalyst/InternalRow.scala |  67 +++-
 .../codegen/GenerateProjection.scala            |   1 +
 .../spark/sql/catalyst/expressions/rows.scala   |  52 ---
 .../expressions/ExpressionEvalHelper.scala      |  27 ++--
 .../expressions/LiteralExpressionSuite.scala    |  61 +++---
 .../expressions/StringFunctionsSuite.scala      |   5 +-
 .../apache/spark/unsafe/types/UTF8String.java   |   6 +-
 .../spark/unsafe/types/UTF8StringSuite.java     |   2 -
 10 files changed, 139 insertions(+), 135 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/6f4cadf5/sql/catalyst/src/main/java/org/apache/spark/sql/BaseRow.java
--
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/BaseRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/BaseRow.java
index 611e02d..6a2356f 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/BaseRow.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/BaseRow.java
@@ -155,27 +155,6 @@ public abstract class BaseRow extends InternalRow {
     throw new UnsupportedOperationException();
   }

-  /**
-   * A generic version of Row.equals(Row), which is used for tests.
-   */
-  @Override
-  public boolean equals(Object other) {
-    if (other instanceof Row) {
-      Row row = (Row) other;
-      int n = size();
-      if (n != row.size()) {
-        return false;
-      }
-      for (int i = 0; i < n; i ++) {
-        if (isNullAt(i) != row.isNullAt(i) || (!isNullAt(i) && !get(i).equals(row.get(i)))) {
-          return false;
-        }
-      }
-      return true;
-    }
-    return false;
-  }
-
   @Override
   public InternalRow copy() {
     final int n = size();

http://git-wip-us.apache.org/repos/asf/spark/blob/6f4cadf5/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
--
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
index 8aaf5d7..e99d5c8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
@@ -17,8 +17,6 @@

 package org.apache.spark.sql

-import scala.util.hashing.MurmurHash3
-
 import org.apache.spark.sql.catalyst.expressions.GenericRow
 import org.apache.spark.sql.types.StructType

@@ -365,36 +363,6 @@ trait Row extends Serializable {
     false
   }

-  override def equals(that: Any): Boolean = that match {
-    case null => false
-    case that: Row =>
-      if (this.length != that.length) {
-        return false
-      }
-      var i = 0
-      val len = this.length
-      while (i < len) {
-        if (apply(i) != that.apply(i)) {
-          return false
-        }
-        i += 1
-      }
-      true
-    case _ => false
-  }
-
-  override def hashCode: Int = {
-    // Using Scala's Seq hash code implementation.
-    var n = 0
-    var h = MurmurHash3.seqSeed
-    val len = length
-    while (n < len) {
-      h = MurmurHash3.mix(h, apply(n).##)
-      n += 1
-    }
-    MurmurHash3.finalizeHash(h, n)
-  }
-
   /* -- utility methods for Scala -- */

   /**
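The underlying JVM behavior that made BinaryType (i.e. Array[Byte]) a special case is easy to demonstrate in isolation:

object ArrayEqualityDemo extends App {
  val a = Array[Byte](1, 2, 3)
  val b = Array[Byte](1, 2, 3)

  println(a == b)                        // false: JVM arrays compare by reference
  println(a.hashCode == b.hashCode)      // false in practice: identity hashes differ
  println(java.util.Arrays.equals(a, b)) // true: element-wise comparison
  println(java.util.Arrays.hashCode(a) == java.util.Arrays.hashCode(b)) // true
}

So any Row equals/hashCode that naively delegates to the elements' own equals/hashCode will treat two rows holding equal byte arrays as distinct, which is exactly what this commit fixes.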
spark git commit: [DOC] [SQL] Addes Hive metastore Parquet table conversion section
Repository: spark Updated Branches: refs/heads/master a8031183a - d96d7b557 [DOC] [SQL] Addes Hive metastore Parquet table conversion section This PR adds a section about Hive metastore Parquet table conversion. It documents: 1. Schema reconciliation rules introduced in #5214 (see [this comment] [1] in #5188) 2. Metadata refreshing requirement introduced in #5339 [1]: https://github.com/apache/spark/pull/5188#issuecomment-86531248 Author: Cheng Lian l...@databricks.com Closes #5348 from liancheng/sql-doc-parquet-conversion and squashes the following commits: 42ae0d0 [Cheng Lian] Adds Python `refreshTable` snippet 4c9847d [Cheng Lian] Resorts to SQL for Python metadata refreshing snippet 756e660 [Cheng Lian] Adds Python snippet for metadata refreshing 50675db [Cheng Lian] Addes Hive metastore Parquet table conversion section Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d96d7b55 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d96d7b55 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d96d7b55 Branch: refs/heads/master Commit: d96d7b55746cf034e3935ec4b22614a99e48c498 Parents: a803118 Author: Cheng Lian l...@databricks.com Authored: Tue Jun 23 14:19:21 2015 -0700 Committer: Cheng Lian l...@databricks.com Committed: Tue Jun 23 14:19:21 2015 -0700 -- docs/sql-programming-guide.md | 94 +++--- 1 file changed, 88 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d96d7b55/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 26c036f..9107c9b 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -22,7 +22,7 @@ The DataFrame API is available in [Scala](api/scala/index.html#org.apache.spark. All of the examples on this page use sample data included in the Spark distribution and can be run in the `spark-shell`, `pyspark` shell, or `sparkR` shell. -## Starting Point: `SQLContext` +## Starting Point: SQLContext div class=codetabs div data-lang=scala markdown=1 @@ -1036,6 +1036,15 @@ for (teenName in collect(teenNames)) { /div +div data-lang=python markdown=1 + +{% highlight python %} +# sqlContext is an existing HiveContext +sqlContext.sql(REFRESH TABLE my_table) +{% endhighlight %} + +/div + div data-lang=sql markdown=1 {% highlight sql %} @@ -1054,7 +1063,7 @@ SELECT * FROM parquetTable /div -### Partition discovery +### Partition Discovery Table partitioning is a common optimization approach used in systems like Hive. In a partitioned table, data are usually stored in different directories, with partitioning column values encoded in @@ -1108,7 +1117,7 @@ can be configured by `spark.sql.sources.partitionColumnTypeInference.enabled`, w `true`. When type inference is disabled, string type will be used for the partitioning columns. -### Schema merging +### Schema Merging Like ProtocolBuffer, Avro, and Thrift, Parquet also supports schema evolution. Users can start with a simple schema, and gradually add more columns to the schema as needed. In this way, users may end @@ -1208,6 +1217,79 @@ printSchema(df3) /div +### Hive metastore Parquet table conversion + +When reading from and writing to Hive metastore Parquet tables, Spark SQL will try to use its own +Parquet support instead of Hive SerDe for better performance. This behavior is controlled by the +`spark.sql.hive.convertMetastoreParquet` configuration, and is turned on by default. 
+
+#### Hive/Parquet Schema Reconciliation
+
+There are two key differences between Hive and Parquet from the perspective of table schema
+processing.
+
+1. Hive is case insensitive, while Parquet is not
+1. Hive considers all columns nullable, while nullability in Parquet is significant
+
+Due to this reason, we must reconcile Hive metastore schema with Parquet schema when converting a
+Hive metastore Parquet table to a Spark SQL Parquet table. The reconciliation rules are:
+
+1. Fields that have the same name in both schemas must have the same data type regardless of
+   nullability. The reconciled field should have the data type of the Parquet side, so that
+   nullability is respected.
+
+1. The reconciled schema contains exactly those fields defined in the Hive metastore schema.
+
+   - Any fields that only appear in the Parquet schema are dropped in the reconciled schema.
+   - Any fields that only appear in the Hive metastore schema are added as nullable fields in the
+     reconciled schema.
+
+#### Metadata Refreshing
+
+Spark SQL caches Parquet metadata for better performance. When Hive metastore Parquet table
+conversion is enabled, metadata of those
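For completeness, the Scala counterpart to the refresh snippets quoted above would look like this (a sketch assuming an existing SparkContext `sc`; `my_table` is the doc's placeholder table name):

import org.apache.spark.sql.hive.HiveContext

val hiveContext = new HiveContext(sc) // sc: an existing SparkContext

hiveContext.refreshTable("my_table")
// or, equivalently, through SQL:
hiveContext.sql("REFRESH TABLE my_table")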
spark git commit: [SPARK-8525] [MLLIB] fix LabeledPoint parser when there is a whitespace between label and features vector
Repository: spark
Updated Branches: refs/heads/branch-1.1 36eed2f9e -> faa35ca05

[SPARK-8525] [MLLIB] fix LabeledPoint parser when there is a whitespace between label and features vector

fix LabeledPoint parser when there is a whitespace between label and features vector, e.g. "(y, [x1, x2, x3])"

Author: Oleksiy Dyagilev oleksiy_dyagi...@epam.com

Closes #6954 from fe2s/SPARK-8525 and squashes the following commits:

0755b9d [Oleksiy Dyagilev] [SPARK-8525][MLLIB] addressing comment, removing dep on commons-lang
c1abc2b [Oleksiy Dyagilev] [SPARK-8525][MLLIB] fix LabeledPoint parser when there is a whitespace on specific position

(cherry picked from commit a8031183aff2e23de9204ddfc7e7f5edbf052a7e)
Signed-off-by: Xiangrui Meng m...@databricks.com

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/faa35ca0
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/faa35ca0
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/faa35ca0

Branch: refs/heads/branch-1.1
Commit: faa35ca05baba40e82bfeb79d4c9f47a5ef62bcc
Parents: 36eed2f
Author: Oleksiy Dyagilev oleksiy_dyagi...@epam.com
Authored: Tue Jun 23 13:12:19 2015 -0700
Committer: Xiangrui Meng m...@databricks.com
Committed: Tue Jun 23 13:18:05 2015 -0700

--
 .../scala/org/apache/spark/mllib/util/NumericParser.scala     | 2 ++
 .../org/apache/spark/mllib/regression/LabeledPointSuite.scala | 5 +++++
 .../org/apache/spark/mllib/util/NumericParserSuite.scala      | 7 +++++++
 3 files changed, 14 insertions(+)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/faa35ca0/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
index f7cba6c..629b0af 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
@@ -98,6 +98,8 @@ private[mllib] object NumericParser {
         }
       } else if (token == ")") {
         parsing = false
+      } else if (token.trim.isEmpty) {
+        // ignore whitespaces between delim chars, e.g. ", ["
       } else {
         // expecting a number
         items.append(parseDouble(token))

http://git-wip-us.apache.org/repos/asf/spark/blob/faa35ca0/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
--
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
index 110c44a..fc08bac 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
@@ -32,6 +32,11 @@ class LabeledPointSuite extends FunSuite {
     }
   }

+  test("parse labeled points with whitespaces") {
+    val point = LabeledPoint.parse("(0.0, [1.0, 2.0])")
+    assert(point === LabeledPoint(0.0, Vectors.dense(1.0, 2.0)))
+  }
+
   test("parse labeled points with v0.9 format") {
     val point = LabeledPoint.parse("1.0,1.0 0.0 -2.0")
     assert(point === LabeledPoint(1.0, Vectors.dense(1.0, 0.0, -2.0)))

http://git-wip-us.apache.org/repos/asf/spark/blob/faa35ca0/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
--
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
index f68fb95..5027311 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
@@ -39,4 +39,11 @@ class NumericParserSuite extends FunSuite {
       }
     }
   }
+
+  test("parser with whitespaces") {
+    val s = "(0.0, [1.0, 2.0])"
+    val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]]
+    assert(parsed(0).asInstanceOf[Double] === 0.0)
+    assert(parsed(1).asInstanceOf[Array[Double]] === Array(1.0, 2.0))
+  }
 }
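The fixed behavior is easy to check from a spark-shell; a small sketch using the public parser (the string literal is the one from the new test):

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors

// Before this fix, the space after the comma broke parsing:
val p = LabeledPoint.parse("(0.0, [1.0, 2.0])")
assert(p == LabeledPoint(0.0, Vectors.dense(1.0, 2.0)))
println(p) // (0.0,[1.0,2.0])

The same change was cherry-picked to branch-1.4, master, branch-1.3, and branch-1.2 in the commits that follow.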
spark git commit: [SPARK-8525] [MLLIB] fix LabeledPoint parser when there is a whitespace between label and features vector
Repository: spark
Updated Branches: refs/heads/branch-1.4 27693e175 -> 8d6e3636e

[SPARK-8525] [MLLIB] fix LabeledPoint parser when there is a whitespace between label and features vector

fix LabeledPoint parser when there is a whitespace between label and features vector, e.g. "(y, [x1, x2, x3])"

Author: Oleksiy Dyagilev oleksiy_dyagi...@epam.com

Closes #6954 from fe2s/SPARK-8525 and squashes the following commits:

0755b9d [Oleksiy Dyagilev] [SPARK-8525][MLLIB] addressing comment, removing dep on commons-lang
c1abc2b [Oleksiy Dyagilev] [SPARK-8525][MLLIB] fix LabeledPoint parser when there is a whitespace on specific position

(cherry picked from commit a8031183aff2e23de9204ddfc7e7f5edbf052a7e)
Signed-off-by: Xiangrui Meng m...@databricks.com

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8d6e3636
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8d6e3636
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8d6e3636

Branch: refs/heads/branch-1.4
Commit: 8d6e3636e9599db0e45e9e6f4e71a08cdc91e11f
Parents: 27693e1
Author: Oleksiy Dyagilev oleksiy_dyagi...@epam.com
Authored: Tue Jun 23 13:12:19 2015 -0700
Committer: Xiangrui Meng m...@databricks.com
Committed: Tue Jun 23 13:15:27 2015 -0700

--
 .../scala/org/apache/spark/mllib/util/NumericParser.scala     | 2 ++
 .../org/apache/spark/mllib/regression/LabeledPointSuite.scala | 5 +++++
 .../org/apache/spark/mllib/util/NumericParserSuite.scala      | 7 +++++++
 3 files changed, 14 insertions(+)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/8d6e3636/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
index 308f7f3..a841c5c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
@@ -98,6 +98,8 @@ private[mllib] object NumericParser {
         }
       } else if (token == ")") {
         parsing = false
+      } else if (token.trim.isEmpty) {
+        // ignore whitespaces between delim chars, e.g. ", ["
       } else {
         // expecting a number
         items.append(parseDouble(token))

http://git-wip-us.apache.org/repos/asf/spark/blob/8d6e3636/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
--
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
index d8364a0..f8d0af8 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
@@ -31,6 +31,11 @@ class LabeledPointSuite extends SparkFunSuite {
     }
   }

+  test("parse labeled points with whitespaces") {
+    val point = LabeledPoint.parse("(0.0, [1.0, 2.0])")
+    assert(point === LabeledPoint(0.0, Vectors.dense(1.0, 2.0)))
+  }
+
   test("parse labeled points with v0.9 format") {
     val point = LabeledPoint.parse("1.0,1.0 0.0 -2.0")
     assert(point === LabeledPoint(1.0, Vectors.dense(1.0, 0.0, -2.0)))

http://git-wip-us.apache.org/repos/asf/spark/blob/8d6e3636/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
--
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
index 8dcb9ba..fa4f74d 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
@@ -37,4 +37,11 @@ class NumericParserSuite extends SparkFunSuite {
       }
     }
   }
+
+  test("parser with whitespaces") {
+    val s = "(0.0, [1.0, 2.0])"
+    val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]]
+    assert(parsed(0).asInstanceOf[Double] === 0.0)
+    assert(parsed(1).asInstanceOf[Array[Double]] === Array(1.0, 2.0))
+  }
 }
spark git commit: [SPARK-8525] [MLLIB] fix LabeledPoint parser when there is a whitespace between label and features vector
Repository: spark
Updated Branches: refs/heads/master f2fb0285a -> a8031183a

[SPARK-8525] [MLLIB] fix LabeledPoint parser when there is a whitespace between label and features vector

fix LabeledPoint parser when there is a whitespace between label and features vector, e.g. "(y, [x1, x2, x3])"

Author: Oleksiy Dyagilev oleksiy_dyagi...@epam.com

Closes #6954 from fe2s/SPARK-8525 and squashes the following commits:

0755b9d [Oleksiy Dyagilev] [SPARK-8525][MLLIB] addressing comment, removing dep on commons-lang
c1abc2b [Oleksiy Dyagilev] [SPARK-8525][MLLIB] fix LabeledPoint parser when there is a whitespace on specific position

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a8031183
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a8031183
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a8031183

Branch: refs/heads/master
Commit: a8031183aff2e23de9204ddfc7e7f5edbf052a7e
Parents: f2fb028
Author: Oleksiy Dyagilev oleksiy_dyagi...@epam.com
Authored: Tue Jun 23 13:12:19 2015 -0700
Committer: Xiangrui Meng m...@databricks.com
Committed: Tue Jun 23 13:12:19 2015 -0700

--
 .../scala/org/apache/spark/mllib/util/NumericParser.scala     | 2 ++
 .../org/apache/spark/mllib/regression/LabeledPointSuite.scala | 5 +++++
 .../org/apache/spark/mllib/util/NumericParserSuite.scala      | 7 +++++++
 3 files changed, 14 insertions(+)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/a8031183/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
index 308f7f3..a841c5c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
@@ -98,6 +98,8 @@ private[mllib] object NumericParser {
         }
       } else if (token == ")") {
         parsing = false
+      } else if (token.trim.isEmpty) {
+        // ignore whitespaces between delim chars, e.g. ", ["
       } else {
         // expecting a number
         items.append(parseDouble(token))

http://git-wip-us.apache.org/repos/asf/spark/blob/a8031183/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
--
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
index d8364a0..f8d0af8 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
@@ -31,6 +31,11 @@ class LabeledPointSuite extends SparkFunSuite {
     }
   }

+  test("parse labeled points with whitespaces") {
+    val point = LabeledPoint.parse("(0.0, [1.0, 2.0])")
+    assert(point === LabeledPoint(0.0, Vectors.dense(1.0, 2.0)))
+  }
+
   test("parse labeled points with v0.9 format") {
     val point = LabeledPoint.parse("1.0,1.0 0.0 -2.0")
     assert(point === LabeledPoint(1.0, Vectors.dense(1.0, 0.0, -2.0)))

http://git-wip-us.apache.org/repos/asf/spark/blob/a8031183/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
--
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
index 8dcb9ba..fa4f74d 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
@@ -37,4 +37,11 @@ class NumericParserSuite extends SparkFunSuite {
       }
     }
   }
+
+  test("parser with whitespaces") {
+    val s = "(0.0, [1.0, 2.0])"
+    val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]]
+    assert(parsed(0).asInstanceOf[Double] === 0.0)
+    assert(parsed(1).asInstanceOf[Array[Double]] === Array(1.0, 2.0))
+  }
 }
spark git commit: [SPARK-8525] [MLLIB] fix LabeledPoint parser when there is a whitespace between label and features vector
Repository: spark
Updated Branches: refs/heads/branch-1.3 716dcf631 -> 88e303f6e

[SPARK-8525] [MLLIB] fix LabeledPoint parser when there is a whitespace between label and features vector

fix LabeledPoint parser when there is a whitespace between label and features vector, e.g. "(y, [x1, x2, x3])"

Author: Oleksiy Dyagilev oleksiy_dyagi...@epam.com

Closes #6954 from fe2s/SPARK-8525 and squashes the following commits:

0755b9d [Oleksiy Dyagilev] [SPARK-8525][MLLIB] addressing comment, removing dep on commons-lang
c1abc2b [Oleksiy Dyagilev] [SPARK-8525][MLLIB] fix LabeledPoint parser when there is a whitespace on specific position

(cherry picked from commit a8031183aff2e23de9204ddfc7e7f5edbf052a7e)
Signed-off-by: Xiangrui Meng m...@databricks.com

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/88e303f6
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/88e303f6
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/88e303f6

Branch: refs/heads/branch-1.3
Commit: 88e303f6e991cdbed519bf11820e1280057581e9
Parents: 716dcf6
Author: Oleksiy Dyagilev oleksiy_dyagi...@epam.com
Authored: Tue Jun 23 13:12:19 2015 -0700
Committer: Xiangrui Meng m...@databricks.com
Committed: Tue Jun 23 13:17:27 2015 -0700

--
 .../scala/org/apache/spark/mllib/util/NumericParser.scala     | 2 ++
 .../org/apache/spark/mllib/regression/LabeledPointSuite.scala | 5 +++++
 .../org/apache/spark/mllib/util/NumericParserSuite.scala      | 7 +++++++
 3 files changed, 14 insertions(+)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/88e303f6/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
index 308f7f3..a841c5c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
@@ -98,6 +98,8 @@ private[mllib] object NumericParser {
         }
       } else if (token == ")") {
         parsing = false
+      } else if (token.trim.isEmpty) {
+        // ignore whitespaces between delim chars, e.g. ", ["
       } else {
         // expecting a number
         items.append(parseDouble(token))

http://git-wip-us.apache.org/repos/asf/spark/blob/88e303f6/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
--
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
index d8364a0..f8d0af8 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
@@ -31,6 +31,11 @@ class LabeledPointSuite extends SparkFunSuite {
     }
   }

+  test("parse labeled points with whitespaces") {
+    val point = LabeledPoint.parse("(0.0, [1.0, 2.0])")
+    assert(point === LabeledPoint(0.0, Vectors.dense(1.0, 2.0)))
+  }
+
   test("parse labeled points with v0.9 format") {
     val point = LabeledPoint.parse("1.0,1.0 0.0 -2.0")
     assert(point === LabeledPoint(1.0, Vectors.dense(1.0, 0.0, -2.0)))

http://git-wip-us.apache.org/repos/asf/spark/blob/88e303f6/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
--
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
index 8dcb9ba..fa4f74d 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
@@ -37,4 +37,11 @@ class NumericParserSuite extends SparkFunSuite {
       }
     }
   }
+
+  test("parser with whitespaces") {
+    val s = "(0.0, [1.0, 2.0])"
+    val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]]
+    assert(parsed(0).asInstanceOf[Double] === 0.0)
+    assert(parsed(1).asInstanceOf[Array[Double]] === Array(1.0, 2.0))
+  }
 }
spark git commit: [SPARK-8525] [MLLIB] fix LabeledPoint parser when there is a whitespace between label and features vector
Repository: spark Updated Branches: refs/heads/branch-1.2 30789f6ef -> 24c2c58c2

[SPARK-8525] [MLLIB] fix LabeledPoint parser when there is a whitespace between label and features vector

fix LabeledPoint parser when there is a whitespace between label and features vector, e.g. (y, [x1, x2, x3])

Author: Oleksiy Dyagilev oleksiy_dyagi...@epam.com

Closes #6954 from fe2s/SPARK-8525 and squashes the following commits: 0755b9d [Oleksiy Dyagilev] [SPARK-8525][MLLIB] addressing comment, removing dep on commons-lang c1abc2b [Oleksiy Dyagilev] [SPARK-8525][MLLIB] fix LabeledPoint parser when there is a whitespace on specific position (cherry picked from commit a8031183aff2e23de9204ddfc7e7f5edbf052a7e) Signed-off-by: Xiangrui Meng m...@databricks.com

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/24c2c58c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/24c2c58c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/24c2c58c Branch: refs/heads/branch-1.2 Commit: 24c2c58c28dfcc32d3d4aa2ea089d8bdaaa7ecf3 Parents: 30789f6 Author: Oleksiy Dyagilev oleksiy_dyagi...@epam.com Authored: Tue Jun 23 13:12:19 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Tue Jun 23 13:17:43 2015 -0700

-- .../scala/org/apache/spark/mllib/util/NumericParser.scala | 2 ++ .../org/apache/spark/mllib/regression/LabeledPointSuite.scala | 5 + .../org/apache/spark/mllib/util/NumericParserSuite.scala | 7 +++ 3 files changed, 14 insertions(+) --

http://git-wip-us.apache.org/repos/asf/spark/blob/24c2c58c/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala index f7cba6c..629b0af 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala @@ -98,6 +98,8 @@ private[mllib] object NumericParser { } } else if (token == ")") { parsing = false + } else if (token.trim.isEmpty) { + // ignore whitespaces between delim chars, e.g. ", [" } else { // expecting a number items.append(parseDouble(token))

http://git-wip-us.apache.org/repos/asf/spark/blob/24c2c58c/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala index 110c44a..fc08bac 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala @@ -32,6 +32,11 @@ class LabeledPointSuite extends FunSuite { } } + test("parse labeled points with whitespaces") { +val point = LabeledPoint.parse("(0.0, [1.0, 2.0])") +assert(point === LabeledPoint(0.0, Vectors.dense(1.0, 2.0))) + } + test("parse labeled points with v0.9 format") { val point = LabeledPoint.parse("1.0,1.0 0.0 -2.0") assert(point === LabeledPoint(1.0, Vectors.dense(1.0, 0.0, -2.0)))

http://git-wip-us.apache.org/repos/asf/spark/blob/24c2c58c/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala index f68fb95..5027311 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala @@ -39,4 +39,11 @@ class NumericParserSuite extends FunSuite { } } } + + test("parser with whitespaces") { +val s = "(0.0, [1.0, 2.0])" +val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]] +assert(parsed(0).asInstanceOf[Double] === 0.0) +assert(parsed(1).asInstanceOf[Array[Double]] === Array(1.0, 2.0)) + } }
Git Push Summary
Repository: spark Updated Tags: refs/tags/v1.4.1-rc1 [created] 60e08e507
[1/2] spark git commit: Preparing Spark release v1.4.1-rc1
Repository: spark Updated Branches: refs/heads/branch-1.4 13f7b0a91 -> eafbe1345

Preparing Spark release v1.4.1-rc1

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/60e08e50 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/60e08e50 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/60e08e50 Branch: refs/heads/branch-1.4 Commit: 60e08e50751fe3929156de956d62faea79f5b801 Parents: 13f7b0a Author: Patrick Wendell pwend...@gmail.com Authored: Tue Jun 23 19:48:39 2015 -0700 Committer: Patrick Wendell pwend...@gmail.com Committed: Tue Jun 23 19:48:39 2015 -0700

-- assembly/pom.xml | 2 +- bagel/pom.xml | 2 +- core/pom.xml | 2 +- examples/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-assembly/pom.xml | 2 +- external/kafka/pom.xml | 2 +- external/mqtt/pom.xml | 2 +- external/twitter/pom.xml | 2 +- external/zeromq/pom.xml | 2 +- extras/java8-tests/pom.xml | 2 +- extras/kinesis-asl/pom.xml | 2 +- extras/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib/pom.xml | 2 +- network/common/pom.xml | 2 +- network/shuffle/pom.xml | 2 +- network/yarn/pom.xml | 2 +- pom.xml | 2 +- repl/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- unsafe/pom.xml | 2 +- yarn/pom.xml | 2 +- 30 files changed, 30 insertions(+), 30 deletions(-) --

http://git-wip-us.apache.org/repos/asf/spark/blob/60e08e50/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index 228db59..ba233e7 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.4.2-SNAPSHOT</version> +<version>1.4.1</version> <relativePath>../pom.xml</relativePath> </parent>

http://git-wip-us.apache.org/repos/asf/spark/blob/60e08e50/bagel/pom.xml -- diff --git a/bagel/pom.xml b/bagel/pom.xml index ce791a6..c5e9183 100644 --- a/bagel/pom.xml +++ b/bagel/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.4.2-SNAPSHOT</version> +<version>1.4.1</version> <relativePath>../pom.xml</relativePath> </parent>

http://git-wip-us.apache.org/repos/asf/spark/blob/60e08e50/core/pom.xml -- diff --git a/core/pom.xml b/core/pom.xml index 176ea9b..f0d236d 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.4.2-SNAPSHOT</version> +<version>1.4.1</version> <relativePath>../pom.xml</relativePath> </parent>

http://git-wip-us.apache.org/repos/asf/spark/blob/60e08e50/examples/pom.xml -- diff --git a/examples/pom.xml b/examples/pom.xml index 877c2fb..e9a9cc2 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.4.2-SNAPSHOT</version> +<version>1.4.1</version> <relativePath>../pom.xml</relativePath> </parent>

http://git-wip-us.apache.org/repos/asf/spark/blob/60e08e50/external/flume-sink/pom.xml -- diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index ad431fa..7eae7a7 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.4.2-SNAPSHOT</version> +<version>1.4.1</version> <relativePath>../../pom.xml</relativePath> </parent>

http://git-wip-us.apache.org/repos/asf/spark/blob/60e08e50/external/flume/pom.xml -- diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 9789435..b3ad09a 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId>
[2/2] spark git commit: Preparing development version 1.4.2-SNAPSHOT
Preparing development version 1.4.2-SNAPSHOT

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/eafbe134 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/eafbe134 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/eafbe134 Branch: refs/heads/branch-1.4 Commit: eafbe1345921e90f2099cff809e2d7fe96f197e5 Parents: 60e08e5 Author: Patrick Wendell pwend...@gmail.com Authored: Tue Jun 23 19:48:44 2015 -0700 Committer: Patrick Wendell pwend...@gmail.com Committed: Tue Jun 23 19:48:44 2015 -0700

-- assembly/pom.xml | 2 +- bagel/pom.xml | 2 +- core/pom.xml | 2 +- examples/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-assembly/pom.xml | 2 +- external/kafka/pom.xml | 2 +- external/mqtt/pom.xml | 2 +- external/twitter/pom.xml | 2 +- external/zeromq/pom.xml | 2 +- extras/java8-tests/pom.xml | 2 +- extras/kinesis-asl/pom.xml | 2 +- extras/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib/pom.xml | 2 +- network/common/pom.xml | 2 +- network/shuffle/pom.xml | 2 +- network/yarn/pom.xml | 2 +- pom.xml | 2 +- repl/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- unsafe/pom.xml | 2 +- yarn/pom.xml | 2 +- 30 files changed, 30 insertions(+), 30 deletions(-) --

http://git-wip-us.apache.org/repos/asf/spark/blob/eafbe134/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index ba233e7..228db59 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.4.1</version> +<version>1.4.2-SNAPSHOT</version> <relativePath>../pom.xml</relativePath> </parent>

http://git-wip-us.apache.org/repos/asf/spark/blob/eafbe134/bagel/pom.xml -- diff --git a/bagel/pom.xml b/bagel/pom.xml index c5e9183..ce791a6 100644 --- a/bagel/pom.xml +++ b/bagel/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.4.1</version> +<version>1.4.2-SNAPSHOT</version> <relativePath>../pom.xml</relativePath> </parent>

http://git-wip-us.apache.org/repos/asf/spark/blob/eafbe134/core/pom.xml -- diff --git a/core/pom.xml b/core/pom.xml index f0d236d..176ea9b 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.4.1</version> +<version>1.4.2-SNAPSHOT</version> <relativePath>../pom.xml</relativePath> </parent>

http://git-wip-us.apache.org/repos/asf/spark/blob/eafbe134/examples/pom.xml -- diff --git a/examples/pom.xml b/examples/pom.xml index e9a9cc2..877c2fb 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.4.1</version> +<version>1.4.2-SNAPSHOT</version> <relativePath>../pom.xml</relativePath> </parent>

http://git-wip-us.apache.org/repos/asf/spark/blob/eafbe134/external/flume-sink/pom.xml -- diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 7eae7a7..ad431fa 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.4.1</version> +<version>1.4.2-SNAPSHOT</version> <relativePath>../../pom.xml</relativePath> </parent>

http://git-wip-us.apache.org/repos/asf/spark/blob/eafbe134/external/flume/pom.xml -- diff --git a/external/flume/pom.xml b/external/flume/pom.xml index b3ad09a..9789435 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.4.1</version> +
spark git commit: [SPARK-6749] [SQL] Make metastore client robust to underlying socket connection loss
Repository: spark Updated Branches: refs/heads/master a458efc66 -> 50c3a86f4

[SPARK-6749] [SQL] Make metastore client robust to underlying socket connection loss

This works around a bug in the underlying RetryingMetaStoreClient (HIVE-10384) by refreshing the metastore client on thrift exceptions. We attempt to emulate the proper hive behavior by retrying only as configured by hiveconf.

Author: Eric Liang e...@databricks.com

Closes #6912 from ericl/spark-6749 and squashes the following commits: 2d54b55 [Eric Liang] use conf from state 0e3a74e [Eric Liang] use shim properly 980b3e5 [Eric Liang] Fix conf parsing hive 0.14 conf. 92459b6 [Eric Liang] Work around RetryingMetaStoreClient bug

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/50c3a86f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/50c3a86f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/50c3a86f Branch: refs/heads/master Commit: 50c3a86f42d7dfd1acbda65c1e5afbd3db1406df Parents: a458efc Author: Eric Liang e...@databricks.com Authored: Tue Jun 23 22:27:17 2015 -0700 Committer: Yin Huai yh...@databricks.com Committed: Tue Jun 23 22:27:17 2015 -0700

-- .../spark/sql/hive/client/ClientWrapper.scala | 55 +++- .../apache/spark/sql/hive/client/HiveShim.scala | 19 +++ 2 files changed, 72 insertions(+), 2 deletions(-) --

http://git-wip-us.apache.org/repos/asf/spark/blob/50c3a86f/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala index 42c2d4c..2f771d7 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.hive.client import java.io.{BufferedReader, InputStreamReader, File, PrintStream} import java.net.URI import java.util.{ArrayList => JArrayList, Map => JMap, List => JList, Set => JSet} +import javax.annotation.concurrent.GuardedBy import scala.collection.JavaConversions._ import scala.language.reflectiveCalls @@ -136,12 +137,62 @@ private[hive] class ClientWrapper( // TODO: should be a def?s // When we create this val client, the HiveConf of it (conf) is the one associated with state. - private val client = Hive.get(conf) + @GuardedBy("this") + private var client = Hive.get(conf) + + // We use hive's conf for compatibility. + private val retryLimit = conf.getIntVar(HiveConf.ConfVars.METASTORETHRIFTFAILURERETRIES) + private val retryDelayMillis = shim.getMetastoreClientConnectRetryDelayMillis(conf) + + /** + * Runs `f` with multiple retries in case the hive metastore is temporarily unreachable. + */ + private def retryLocked[A](f: => A): A = synchronized { +// Hive sometimes retries internally, so set a deadline to avoid compounding delays. +val deadline = System.nanoTime + (retryLimit * retryDelayMillis * 1e6).toLong +var numTries = 0 +var caughtException: Exception = null +do { + numTries += 1 + try { +return f + } catch { +case e: Exception if causedByThrift(e) => + caughtException = e + logWarning( +"HiveClientWrapper got thrift exception, destroying client and retrying " + + s"(${retryLimit - numTries} tries remaining)", e) + Thread.sleep(retryDelayMillis) + try { +client = Hive.get(state.getConf, true) + } catch { +case e: Exception if causedByThrift(e) => + logWarning("Failed to refresh hive client, will retry.", e) + } + } +} while (numTries <= retryLimit && System.nanoTime < deadline) +if (System.nanoTime > deadline) { + logWarning("Deadline exceeded") +} +throw caughtException + } + + private def causedByThrift(e: Throwable): Boolean = { +var target = e +while (target != null) { + val msg = target.getMessage() + if (msg != null && msg.matches("(?s).*(TApplication|TProtocol|TTransport)Exception.*")) { +return true + } + target = target.getCause() +} +false + } /** * Runs `f` with ThreadLocal session state and classloaders configured for this version of hive. */ - private def withHiveState[A](f: => A): A = synchronized { + private def withHiveState[A](f: => A): A = retryLocked { val original = Thread.currentThread().getContextClassLoader // Set the thread local metastore client to the client associated with this ClientWrapper. Hive.set(client)
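The retryLocked shape — cap both the attempt count and the total wall-clock time, then rethrow the last failure — is useful beyond Hive. A self-contained sketch of the same control flow (the names and the simulated failure are illustrative, not Spark APIs):

```scala
object RetryWithDeadline {
  // Illustrative re-creation of retryLocked's control flow: retry `f` on
  // exception, but stop once either the attempt budget or the wall-clock
  // deadline is exhausted, then rethrow the last failure.
  def retry[A](maxTries: Int, delayMillis: Long)(f: => A): A = {
    val deadline = System.nanoTime + maxTries * delayMillis * 1000000L
    var numTries = 0
    var lastFailure: Exception = null
    do {
      numTries += 1
      try {
        return f
      } catch {
        case e: Exception =>
          lastFailure = e
          Thread.sleep(delayMillis)
      }
    } while (numTries <= maxTries && System.nanoTime < deadline)
    throw lastFailure
  }

  def main(args: Array[String]): Unit = {
    var calls = 0
    val result = retry(maxTries = 3, delayMillis = 10) {
      calls += 1
      if (calls < 3) throw new RuntimeException(s"transient failure #$calls")
      "ok"
    }
    println(s"$result after $calls calls") // ok after 3 calls
  }
}
```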
spark git commit: [SPARK-8483] [STREAMING] Remove commons-lang3 dependency from Flume Si…
Repository: spark Updated Branches: refs/heads/master 31bd30687 -> 9b618fb0d

[SPARK-8483] [STREAMING] Remove commons-lang3 dependency from Flume Sink. Also bump Flume version to 1.6.0

Author: Hari Shreedharan hshreedha...@apache.org

Closes #6910 from harishreedharan/remove-commons-lang3 and squashes the following commits: 9875f7d [Hari Shreedharan] Revert back to Flume 1.4.0 ca35eb0 [Hari Shreedharan] [SPARK-8483][Streaming] Remove commons-lang3 dependency from Flume Sink. Also bump Flume version to 1.6.0

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9b618fb0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9b618fb0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9b618fb0 Branch: refs/heads/master Commit: 9b618fb0d2536121d2784ff5341d74723e810fc5 Parents: 31bd306 Author: Hari Shreedharan hshreedha...@apache.org Authored: Mon Jun 22 23:34:17 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Mon Jun 22 23:34:17 2015 -0700

-- external/flume-sink/pom.xml | 4 .../spark/streaming/flume/sink/SparkAvroCallbackHandler.scala | 4 ++-- 2 files changed, 2 insertions(+), 6 deletions(-) --

http://git-wip-us.apache.org/repos/asf/spark/blob/9b618fb0/external/flume-sink/pom.xml -- diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 7a7dccc..0664cfb 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -36,10 +36,6 @@ <dependencies> <dependency> - <groupId>org.apache.commons</groupId> - <artifactId>commons-lang3</artifactId> -</dependency> -<dependency> <groupId>org.apache.flume</groupId> <artifactId>flume-ng-sdk</artifactId> <exclusions>

http://git-wip-us.apache.org/repos/asf/spark/blob/9b618fb0/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkAvroCallbackHandler.scala -- diff --git a/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkAvroCallbackHandler.scala b/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkAvroCallbackHandler.scala index dc2a4ab..719fca0 100644 --- a/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkAvroCallbackHandler.scala +++ b/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkAvroCallbackHandler.scala @@ -16,13 +16,13 @@ */ package org.apache.spark.streaming.flume.sink +import java.util.UUID import java.util.concurrent.{CountDownLatch, Executors} import java.util.concurrent.atomic.AtomicLong import scala.collection.mutable import org.apache.flume.Channel -import org.apache.commons.lang3.RandomStringUtils /** * Class that implements the SparkFlumeProtocol, that is used by the Avro Netty Server to process @@ -53,7 +53,7 @@ private[flume] class SparkAvroCallbackHandler(val threads: Int, val channel: Cha // Since the new txn may not have the same sequence number we must guard against accidentally // committing a new transaction. To reduce the probability of that happening a random string is // prepended to the sequence number. Does not change for life of sink - private val seqBase = RandomStringUtils.randomAlphanumeric(8) + private val seqBase = UUID.randomUUID().toString.substring(0, 8) private val seqCounter = new AtomicLong(0) // Protected by `sequenceNumberToProcessor`
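What the replacement expression yields, as a quick sketch (the printed value is illustrative; the first eight characters of a UUID string are hex digits, a narrower alphabet than RandomStringUtils.randomAlphanumeric but still adequate as a uniqueness prefix):

```scala
import java.util.UUID

object SeqBaseExample {
  def main(args: Array[String]): Unit = {
    // Same expression the sink now uses for its per-instance prefix.
    val seqBase = UUID.randomUUID().toString.substring(0, 8)
    println(seqBase)        // e.g. "3f2b8c1d" — first 8 chars of the UUID
    println(seqBase.length) // 8
  }
}
```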
spark git commit: [SPARK-8359] [SQL] Fix incorrect decimal precision after multiplication
Repository: spark Updated Branches: refs/heads/master d4f633514 -> 31bd30687

[SPARK-8359] [SQL] Fix incorrect decimal precision after multiplication

JIRA: https://issues.apache.org/jira/browse/SPARK-8359

Author: Liang-Chi Hsieh vii...@gmail.com

Closes #6814 from viirya/fix_decimal2 and squashes the following commits: 071a757 [Liang-Chi Hsieh] Remove maximum precision and use MathContext.UNLIMITED. df217d4 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into fix_decimal2 a43bfc3 [Liang-Chi Hsieh] Add MathContext with maximum supported precision. 72eeb3f [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into fix_decimal2 44c9348 [Liang-Chi Hsieh] Fix incorrect decimal precision after multiplication.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/31bd3068 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/31bd3068 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/31bd3068 Branch: refs/heads/master Commit: 31bd30687bc29c0e457c37308d489ae2b6e5b72a Parents: d4f6335 Author: Liang-Chi Hsieh vii...@gmail.com Authored: Mon Jun 22 23:11:56 2015 -0700 Committer: Davies Liu dav...@databricks.com Committed: Mon Jun 22 23:11:56 2015 -0700

-- .../src/main/scala/org/apache/spark/sql/types/Decimal.scala | 6 -- .../org/apache/spark/sql/types/decimal/DecimalSuite.scala | 5 + 2 files changed, 9 insertions(+), 2 deletions(-) --

http://git-wip-us.apache.org/repos/asf/spark/blob/31bd3068/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala index a85af9e..bd9823b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.types +import java.math.{MathContext, RoundingMode} + import org.apache.spark.annotation.DeveloperApi /** @@ -137,9 +139,9 @@ final class Decimal extends Ordered[Decimal] with Serializable { def toBigDecimal: BigDecimal = { if (decimalVal.ne(null)) { - decimalVal + decimalVal(MathContext.UNLIMITED) } else { - BigDecimal(longVal, _scale) + BigDecimal(longVal, _scale)(MathContext.UNLIMITED) } }

http://git-wip-us.apache.org/repos/asf/spark/blob/31bd3068/sql/catalyst/src/test/scala/org/apache/spark/sql/types/decimal/DecimalSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/decimal/DecimalSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/decimal/DecimalSuite.scala index 4c0365c..ccc29c0 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/decimal/DecimalSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/decimal/DecimalSuite.scala @@ -162,4 +162,9 @@ class DecimalSuite extends SparkFunSuite with PrivateMethodTester { assert(new Decimal().set(100L, 10, 0).toUnscaledLong === 100L) assert(Decimal(Long.MaxValue, 100, 0).toUnscaledLong === Long.MaxValue) } + + test("accurate precision after multiplication") { +val decimal = (Decimal(Long.MaxValue, 38, 0) * Decimal(Long.MaxValue, 38, 0)).toJavaBigDecimal +assert(decimal.unscaledValue.toString === "85070591730234615847396907784232501249") + } }
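The root cause is that scala.math.BigDecimal defaults to MathContext.DECIMAL128 (34 significant digits), so the 38-digit product above was silently rounded. A minimal sketch of the difference, assuming nothing beyond the standard library:

```scala
import java.math.MathContext

object DecimalPrecisionExample {
  def main(args: Array[String]): Unit = {
    // Default Scala BigDecimal carries MathContext.DECIMAL128: the 38-digit
    // product of two Long.MaxValue factors is rounded to 34 significant digits.
    val x = BigDecimal(Long.MaxValue)
    println((x * x).underlying.unscaledValue)

    // With MathContext.UNLIMITED the full product survives:
    // 85070591730234615847396907784232501249
    val y = BigDecimal(Long.MaxValue.toString, MathContext.UNLIMITED)
    println((y * y).underlying.unscaledValue)
  }
}
```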
spark git commit: [SPARK-8541] [PYSPARK] test the absolute error in approx doctests
Repository: spark Updated Branches: refs/heads/master 9b618fb0d -> f0dcbe8a7

[SPARK-8541] [PYSPARK] test the absolute error in approx doctests

A minor change but one which is (presumably) visible on the public api docs webpage.

Author: Scott Taylor git...@megatron.me.uk

Closes #6942 from megatron-me-uk/patch-3 and squashes the following commits: fbed000 [Scott Taylor] test the absolute error in approx doctests

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f0dcbe8a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f0dcbe8a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f0dcbe8a Branch: refs/heads/master Commit: f0dcbe8a7c2de510b47a21eb45cde34777638758 Parents: 9b618fb Author: Scott Taylor git...@megatron.me.uk Authored: Mon Jun 22 23:37:56 2015 -0700 Committer: Josh Rosen joshro...@databricks.com Committed: Mon Jun 22 23:37:56 2015 -0700

-- python/pyspark/rdd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) --

http://git-wip-us.apache.org/repos/asf/spark/blob/f0dcbe8a/python/pyspark/rdd.py -- diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 20c0bc9..1b64be2 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -2198,7 +2198,7 @@ class RDD(object): >>> rdd = sc.parallelize(range(1000), 10) >>> r = sum(range(1000)) - >>> (rdd.sumApprox(1000) - r) / r < 0.05 + >>> abs(rdd.sumApprox(1000) - r) / r < 0.05 True """ jrdd = self.mapPartitions(lambda it: [float(sum(it))])._to_java_object_rdd() @@ -2215,7 +2215,7 @@ class RDD(object): >>> rdd = sc.parallelize(range(1000), 10) >>> r = sum(range(1000)) / 1000.0 - >>> (rdd.meanApprox(1000) - r) / r < 0.05 + >>> abs(rdd.meanApprox(1000) - r) / r < 0.05 True """ jrdd = self.map(float)._to_java_object_rdd()
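Why abs matters here: with the signed check, any underestimate makes the left-hand side negative, so the comparison passes no matter how far off the approximation is. A small illustration of the arithmetic (plain Scala, mirroring the doctest):

```scala
object RelativeErrorExample {
  def main(args: Array[String]): Unit = {
    val r = (0 until 1000).sum.toDouble // 499500.0, the exact answer

    val wildUnderestimate = 100.0
    // The old, signed check: passes even for a hopeless estimate.
    println((wildUnderestimate - r) / r < 0.05)          // true
    // The fixed check bounds the error in both directions.
    println(math.abs(wildUnderestimate - r) / r < 0.05)  // false
  }
}
```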
spark git commit: [SPARK-8483] [STREAMING] Remove commons-lang3 dependency from Flume Sink
Repository: spark Updated Branches: refs/heads/branch-1.4 d0943afbc -> 929479675

[SPARK-8483] [STREAMING] Remove commons-lang3 dependency from Flume Sink

Author: Hari Shreedharan hshreedha...@apache.org

Closes #6910 from harishreedharan/remove-commons-lang3 and squashes the following commits: 9875f7d [Hari Shreedharan] Revert back to Flume 1.4.0 ca35eb0 [Hari Shreedharan] [SPARK-8483][Streaming] Remove commons-lang3 dependency from Flume Sink. Also bump Flume version to 1.6.0

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/92947967 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/92947967 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/92947967 Branch: refs/heads/branch-1.4 Commit: 9294796750f9c9330ab113f025763e68b624abc9 Parents: d0943af Author: Hari Shreedharan hshreedha...@apache.org Authored: Mon Jun 22 23:34:17 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Mon Jun 22 23:41:35 2015 -0700

-- external/flume-sink/pom.xml | 4 .../spark/streaming/flume/sink/SparkAvroCallbackHandler.scala | 4 ++-- 2 files changed, 2 insertions(+), 6 deletions(-) --

http://git-wip-us.apache.org/repos/asf/spark/blob/92947967/external/flume-sink/pom.xml -- diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 37f2b1b..ad431fa 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -36,10 +36,6 @@ <dependencies> <dependency> - <groupId>org.apache.commons</groupId> - <artifactId>commons-lang3</artifactId> -</dependency> -<dependency> <groupId>org.apache.flume</groupId> <artifactId>flume-ng-sdk</artifactId> <exclusions>

http://git-wip-us.apache.org/repos/asf/spark/blob/92947967/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkAvroCallbackHandler.scala -- diff --git a/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkAvroCallbackHandler.scala b/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkAvroCallbackHandler.scala index dc2a4ab..719fca0 100644 --- a/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkAvroCallbackHandler.scala +++ b/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkAvroCallbackHandler.scala @@ -16,13 +16,13 @@ */ package org.apache.spark.streaming.flume.sink +import java.util.UUID import java.util.concurrent.{CountDownLatch, Executors} import java.util.concurrent.atomic.AtomicLong import scala.collection.mutable import org.apache.flume.Channel -import org.apache.commons.lang3.RandomStringUtils /** * Class that implements the SparkFlumeProtocol, that is used by the Avro Netty Server to process @@ -53,7 +53,7 @@ private[flume] class SparkAvroCallbackHandler(val threads: Int, val channel: Cha // Since the new txn may not have the same sequence number we must guard against accidentally // committing a new transaction. To reduce the probability of that happening a random string is // prepended to the sequence number. Does not change for life of sink - private val seqBase = RandomStringUtils.randomAlphanumeric(8) + private val seqBase = UUID.randomUUID().toString.substring(0, 8) private val seqCounter = new AtomicLong(0) // Protected by `sequenceNumberToProcessor`
spark git commit: [SPARK-8431] [SPARKR] Add in operator to DataFrame Column in SparkR
Repository: spark Updated Branches: refs/heads/master 164fe2aa4 -> d4f633514

[SPARK-8431] [SPARKR] Add in operator to DataFrame Column in SparkR

[[SPARK-8431] Add in operator to DataFrame Column in SparkR - ASF JIRA](https://issues.apache.org/jira/browse/SPARK-8431)

Author: Yu ISHIKAWA yuu.ishik...@gmail.com

Closes #6941 from yu-iskw/SPARK-8431 and squashes the following commits: 1f64423 [Yu ISHIKAWA] Modify the comment f4309a7 [Yu ISHIKAWA] Make a `setMethod` for `%in%` be independent 6e37936 [Yu ISHIKAWA] Modify a variable name c196173 [Yu ISHIKAWA] [SPARK-8431][SparkR] Add in operator to DataFrame Column in SparkR

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d4f63351 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d4f63351 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d4f63351 Branch: refs/heads/master Commit: d4f633514a393320c9ae64c00a75f702e6f58c67 Parents: 164fe2a Author: Yu ISHIKAWA yuu.ishik...@gmail.com Authored: Mon Jun 22 23:04:36 2015 -0700 Committer: Davies Liu dav...@databricks.com Committed: Mon Jun 22 23:04:36 2015 -0700

-- R/pkg/R/column.R | 16 R/pkg/inst/tests/test_sparkSQL.R | 10 ++ 2 files changed, 26 insertions(+) --

http://git-wip-us.apache.org/repos/asf/spark/blob/d4f63351/R/pkg/R/column.R -- diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index 80e92d3..8e4b0f5 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -210,6 +210,22 @@ setMethod("cast", } }) +#' Match a column with given values. +#' +#' @rdname column +#' @return a matched values as a result of comparing with given values. +#' \dontrun{ +#' filter(df, "age in (10, 30)") +#' where(df, df$age %in% c(10, 30)) +#' } +setMethod("%in%", + signature(x = "Column"), + function(x, table) { +table <- listToSeq(as.list(table)) +jc <- callJMethod(x@jc, "in", table) +return(column(jc)) + }) + #' Approx Count Distinct #' #' @rdname column

http://git-wip-us.apache.org/repos/asf/spark/blob/d4f63351/R/pkg/inst/tests/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index fc7f3f0..417153d 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -693,6 +693,16 @@ test_that("filter() on a DataFrame", { filtered2 <- where(df, df$name != "Michael") expect_true(count(filtered2) == 2) expect_true(collect(filtered2)$age[2] == 19) + + # test suites for %in% + filtered3 <- filter(df, "age in (19)") + expect_equal(count(filtered3), 1) + filtered4 <- filter(df, "age in (19, 30)") + expect_equal(count(filtered4), 2) + filtered5 <- where(df, df$age %in% c(19)) + expect_equal(count(filtered5), 1) + filtered6 <- where(df, df$age %in% c(19, 30)) + expect_equal(count(filtered6), 2) }) test_that("join() on a DataFrame", {
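On the JVM side this resolves to the in method on Column, which the R wrapper invokes via callJMethod(x@jc, "in", table). A Scala sketch of the equivalent call (assuming the 1.4-era Column.in varargs API the wrapper targets; the sample data is illustrative):

```scala
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions.lit

object InOperatorExample {
  // Sketch of the Scala-side equivalent of the SparkR
  // `where(df, df$age %in% c(19, 30))` call from the tests above.
  def run(sqlContext: SQLContext): Unit = {
    import sqlContext.implicits._
    val df = Seq(("Michael", 29), ("Andy", 30), ("Justin", 19)).toDF("name", "age")
    val filtered = df.where(df("age").in(lit(19), lit(30)))
    filtered.show() // Andy and Justin
  }
}
```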
spark git commit: [SPARK-8541] [PYSPARK] test the absolute error in approx doctests
Repository: spark Updated Branches: refs/heads/branch-1.4 22cc1ab66 -> d0943afbc

[SPARK-8541] [PYSPARK] test the absolute error in approx doctests

A minor change but one which is (presumably) visible on the public api docs webpage.

Author: Scott Taylor git...@megatron.me.uk

Closes #6942 from megatron-me-uk/patch-3 and squashes the following commits: fbed000 [Scott Taylor] test the absolute error in approx doctests (cherry picked from commit f0dcbe8a7c2de510b47a21eb45cde34777638758) Signed-off-by: Josh Rosen joshro...@databricks.com

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d0943afb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d0943afb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d0943afb Branch: refs/heads/branch-1.4 Commit: d0943afbcffec5d8b668794dedc8d85fb10b0596 Parents: 22cc1ab Author: Scott Taylor git...@megatron.me.uk Authored: Mon Jun 22 23:37:56 2015 -0700 Committer: Josh Rosen joshro...@databricks.com Committed: Mon Jun 22 23:38:21 2015 -0700

-- python/pyspark/rdd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) --

http://git-wip-us.apache.org/repos/asf/spark/blob/d0943afb/python/pyspark/rdd.py -- diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 20c0bc9..1b64be2 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -2198,7 +2198,7 @@ class RDD(object): >>> rdd = sc.parallelize(range(1000), 10) >>> r = sum(range(1000)) - >>> (rdd.sumApprox(1000) - r) / r < 0.05 + >>> abs(rdd.sumApprox(1000) - r) / r < 0.05 True """ jrdd = self.mapPartitions(lambda it: [float(sum(it))])._to_java_object_rdd() @@ -2215,7 +2215,7 @@ class RDD(object): >>> rdd = sc.parallelize(range(1000), 10) >>> r = sum(range(1000)) / 1000.0 - >>> (rdd.meanApprox(1000) - r) / r < 0.05 + >>> abs(rdd.meanApprox(1000) - r) / r < 0.05 True """ jrdd = self.map(float)._to_java_object_rdd()
spark git commit: [SPARK-8541] [PYSPARK] test the absolute error in approx doctests
Repository: spark Updated Branches: refs/heads/branch-1.3 45b4527e3 -> 716dcf631

[SPARK-8541] [PYSPARK] test the absolute error in approx doctests

A minor change but one which is (presumably) visible on the public api docs webpage.

Author: Scott Taylor git...@megatron.me.uk

Closes #6942 from megatron-me-uk/patch-3 and squashes the following commits: fbed000 [Scott Taylor] test the absolute error in approx doctests (cherry picked from commit f0dcbe8a7c2de510b47a21eb45cde34777638758) Signed-off-by: Josh Rosen joshro...@databricks.com

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/716dcf63 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/716dcf63 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/716dcf63 Branch: refs/heads/branch-1.3 Commit: 716dcf631558920c080cb824dcd617789b9f96d5 Parents: 45b4527 Author: Scott Taylor git...@megatron.me.uk Authored: Mon Jun 22 23:37:56 2015 -0700 Committer: Josh Rosen joshro...@databricks.com Committed: Mon Jun 22 23:39:39 2015 -0700

-- python/pyspark/rdd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) --

http://git-wip-us.apache.org/repos/asf/spark/blob/716dcf63/python/pyspark/rdd.py -- diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index d80366a..bd18cb3 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -2130,7 +2130,7 @@ class RDD(object): >>> rdd = sc.parallelize(range(1000), 10) >>> r = sum(xrange(1000)) - >>> (rdd.sumApprox(1000) - r) / r < 0.05 + >>> abs(rdd.sumApprox(1000) - r) / r < 0.05 True """ jrdd = self.mapPartitions(lambda it: [float(sum(it))])._to_java_object_rdd() @@ -2147,7 +2147,7 @@ class RDD(object): >>> rdd = sc.parallelize(range(1000), 10) >>> r = sum(xrange(1000)) / 1000.0 - >>> (rdd.meanApprox(1000) - r) / r < 0.05 + >>> abs(rdd.meanApprox(1000) - r) / r < 0.05 True """ jrdd = self.map(float)._to_java_object_rdd()
spark git commit: [SPARK-8300] DataFrame hint for broadcast join.
Repository: spark Updated Branches: refs/heads/master f0dcbe8a7 -> 6ceb16960

[SPARK-8300] DataFrame hint for broadcast join.

Users can now do ```scala left.join(broadcast(right), "joinKey") ``` to give the query planner a hint that the right DataFrame is small and should be broadcasted.

Author: Reynold Xin r...@databricks.com

Closes #6751 from rxin/broadcastjoin-hint and squashes the following commits: 953eec2 [Reynold Xin] Code review feedback. 88752d8 [Reynold Xin] Fixed import. 8187b88 [Reynold Xin] [SPARK-8300] DataFrame hint for broadcast join.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6ceb1696 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6ceb1696 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6ceb1696 Branch: refs/heads/master Commit: 6ceb169608428a651d53c93bf73ca5ac53a6bde2 Parents: f0dcbe8 Author: Reynold Xin r...@databricks.com Authored: Tue Jun 23 01:50:31 2015 -0700 Committer: Michael Armbrust mich...@databricks.com Committed: Tue Jun 23 01:50:31 2015 -0700

-- .../catalyst/plans/logical/basicOperators.scala | 8 +++ .../spark/sql/execution/SparkStrategies.scala | 25 +--- .../scala/org/apache/spark/sql/functions.scala | 17 + .../apache/spark/sql/DataFrameJoinSuite.scala | 17 + 4 files changed, 59 insertions(+), 8 deletions(-) --

http://git-wip-us.apache.org/repos/asf/spark/blob/6ceb1696/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala index f8e5916..7814e51 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala @@ -130,6 +130,14 @@ case class Join( } } +/** + * A hint for the optimizer that we should broadcast the `child` if used in a join operator. + */ +case class BroadcastHint(child: LogicalPlan) extends UnaryNode { + override def output: Seq[Attribute] = child.output +} + + case class Except(left: LogicalPlan, right: LogicalPlan) extends BinaryNode { override def output: Seq[Attribute] = left.output }

http://git-wip-us.apache.org/repos/asf/spark/blob/6ceb1696/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index 422992d..5c420eb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.execution import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning._ import org.apache.spark.sql.catalyst.plans._ -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.plans.logical.{BroadcastHint, LogicalPlan} import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.columnar.{InMemoryColumnarTableScan, InMemoryRelation} import org.apache.spark.sql.execution.{DescribeCommand => RunnableDescribeCommand} @@ -53,6 +53,18 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { } /** + * Matches a plan whose output should be small enough to be used in broadcast join. + */ + object CanBroadcast { +def unapply(plan: LogicalPlan): Option[LogicalPlan] = plan match { + case BroadcastHint(p) => Some(p) + case p if sqlContext.conf.autoBroadcastJoinThreshold > 0 && +p.statistics.sizeInBytes <= sqlContext.conf.autoBroadcastJoinThreshold => Some(p) + case _ => None +} + } + + /** * Uses the ExtractEquiJoinKeys pattern to find joins where at least some of the predicates can be * evaluated by matching hash keys. * @@ -80,15 +92,11 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { } def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { - case ExtractEquiJoinKeys(Inner, leftKeys, rightKeys, condition, left, right) -if sqlContext.conf.autoBroadcastJoinThreshold > 0 && - right.statistics.sizeInBytes <= sqlContext.conf.autoBroadcastJoinThreshold => + case ExtractEquiJoinKeys(Inner, leftKeys, rightKeys, condition, left, CanBroadcast(right)) =>
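Typical usage follows the commit message; a short sketch (the people/cities frames and the cityId key are illustrative):

```scala
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.broadcast

object BroadcastHintExample {
  // `people` is large, `cities` is a small dimension table: the hint tells the
  // planner to ship `cities` to every executor and use a broadcast join,
  // regardless of the spark.sql.autoBroadcastJoinThreshold size estimate.
  def joinWithHint(people: DataFrame, cities: DataFrame): DataFrame = {
    people.join(broadcast(cities), "cityId")
  }
}
```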
spark git commit: Revert [SPARK-7157][SQL] add sampleBy to DataFrame
Repository: spark Updated Branches: refs/heads/master 0401cbaa8 -> a458efc66

Revert "[SPARK-7157][SQL] add sampleBy to DataFrame"

This reverts commit 0401cbaa8ee51c71f43604f338b65022a479da0a. The new test case on Jenkins is failing.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a458efc6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a458efc6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a458efc6 Branch: refs/heads/master Commit: a458efc66c31dc281af379b914bfa2b077ca6635 Parents: 0401cba Author: Reynold Xin r...@databricks.com Authored: Tue Jun 23 19:30:25 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Tue Jun 23 19:30:25 2015 -0700

-- python/pyspark/sql/dataframe.py | 40 .../spark/sql/DataFrameStatFunctions.scala | 24 .../apache/spark/sql/DataFrameStatSuite.scala | 12 +- 3 files changed, 2 insertions(+), 74 deletions(-) --

http://git-wip-us.apache.org/repos/asf/spark/blob/a458efc6/python/pyspark/sql/dataframe.py -- diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 213338d..152b873 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -448,41 +448,6 @@ class DataFrame(object): rdd = self._jdf.sample(withReplacement, fraction, long(seed)) return DataFrame(rdd, self.sql_ctx) -@since(1.5) -def sampleBy(self, col, fractions, seed=None): -""" -Returns a stratified sample without replacement based on the -fraction given on each stratum. - -:param col: column that defines strata -:param fractions: -sampling fraction for each stratum. If a stratum is not -specified, we treat its fraction as zero. -:param seed: random seed -:return: a new DataFrame that represents the stratified sample - - >>> from pyspark.sql.functions import col - >>> dataset = sqlContext.range(0, 100).select((col("id") % 3).alias("key")) - >>> sampled = dataset.sampleBy("key", fractions={0: 0.1, 1: 0.2}, seed=0) - >>> sampled.groupBy("key").count().orderBy("key").show() -+---+-----+ -|key|count| -+---+-----+ -| 0| 5| -| 1| 8| -+---+-----+ -""" -if not isinstance(col, str): -raise ValueError("col must be a string, but got %r" % type(col)) -if not isinstance(fractions, dict): -raise ValueError("fractions must be a dict but got %r" % type(fractions)) -for k, v in fractions.items(): -if not isinstance(k, (float, int, long, basestring)): -raise ValueError("key must be float, int, long, or string, but got %r" % type(k)) -fractions[k] = float(v) -seed = seed if seed is not None else random.randint(0, sys.maxsize) -return DataFrame(self._jdf.stat().sampleBy(col, self._jmap(fractions), seed), self.sql_ctx) - @since(1.4) def randomSplit(self, weights, seed=None): """Randomly splits this :class:`DataFrame` with the provided weights. @@ -1357,11 +1322,6 @@ class DataFrameStatFunctions(object): freqItems.__doc__ = DataFrame.freqItems.__doc__ -def sampleBy(self, col, fractions, seed=None): -return self.df.sampleBy(col, fractions, seed) - -sampleBy.__doc__ = DataFrame.sampleBy.__doc__ - def _test(): import doctest

http://git-wip-us.apache.org/repos/asf/spark/blob/a458efc6/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index 955d287..edb9ed7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -17,8 +17,6 @@ package org.apache.spark.sql -import java.util.UUID - import org.apache.spark.annotation.Experimental import org.apache.spark.sql.execution.stat._ @@ -165,26 +163,4 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { def freqItems(cols: Seq[String]): DataFrame = { FrequentItems.singlePassFreqItems(df, cols, 0.01) } - - /** - * Returns a stratified sample without replacement based on the fraction given on each stratum. - * @param col column that defines strata - * @param fractions sampling fraction for each stratum. If a stratum is not specified, we treat - * its fraction as zero. - * @param seed random seed - * @return a new [[DataFrame]] that
Git Push Summary
Repository: spark Updated Tags: refs/tags/v1.4.1-rc1 [deleted] d0a5560ce
spark git commit: [SPARK-8573] [SPARK-8568] [SQL] [PYSPARK] raise Exception if column is used in booelan expression
Repository: spark Updated Branches: refs/heads/branch-1.4 8d6e3636e -> 13f7b0a91

[SPARK-8573] [SPARK-8568] [SQL] [PYSPARK] raise Exception if column is used in booelan expression

It's a common mistake that a user will put a Column in a boolean expression (together with `and`, `or`), which does not work as expected; we should raise an exception in that case and suggest the user use `&`, `|` instead.

Author: Davies Liu dav...@databricks.com

Closes #6961 from davies/column_bool and squashes the following commits: 9f19beb [Davies Liu] update message af74bd6 [Davies Liu] fix tests 07dff84 [Davies Liu] address comments, fix tests f70c08e [Davies Liu] raise Exception if column is used in booelan expression (cherry picked from commit 7fb5ae5024284593204779ff463bfbdb4d1c6da5) Signed-off-by: Davies Liu dav...@databricks.com

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/13f7b0a9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/13f7b0a9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/13f7b0a9 Branch: refs/heads/branch-1.4 Commit: 13f7b0a910557fb5564636031f43c2674a7dfa28 Parents: 8d6e363 Author: Davies Liu dav...@databricks.com Authored: Tue Jun 23 15:51:16 2015 -0700 Committer: Davies Liu dav...@databricks.com Committed: Tue Jun 23 15:51:35 2015 -0700

-- python/pyspark/sql/column.py | 5 + python/pyspark/sql/tests.py | 10 +- 2 files changed, 14 insertions(+), 1 deletion(-) --

http://git-wip-us.apache.org/repos/asf/spark/blob/13f7b0a9/python/pyspark/sql/column.py -- diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py index 1ecec5b..0a85da7 100644 --- a/python/pyspark/sql/column.py +++ b/python/pyspark/sql/column.py @@ -396,6 +396,11 @@ class Column(object): jc = self._jc.over(window._jspec) return Column(jc) +def __nonzero__(self): +raise ValueError("Cannot convert column into bool: please use '&' for 'and', '|' for 'or', " + "'~' for 'not' when building DataFrame boolean expressions.") +__bool__ = __nonzero__ + def __repr__(self): return 'Column<%s>' % self._jc.toString().encode('utf8')

http://git-wip-us.apache.org/repos/asf/spark/blob/13f7b0a9/python/pyspark/sql/tests.py -- diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 4e72407..5c25890 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -149,6 +149,14 @@ class SQLTests(ReusedPySparkTestCase): self.assertEqual(result[0][0], a) self.assertEqual(result[0][1], b) +def test_and_in_expression(self): +self.assertEqual(4, self.df.filter((self.df.key <= 10) & (self.df.value <= "2")).count()) +self.assertRaises(ValueError, lambda: (self.df.key <= 10) and (self.df.value <= "2")) +self.assertEqual(14, self.df.filter((self.df.key <= 3) | (self.df.value < "2")).count()) +self.assertRaises(ValueError, lambda: self.df.key <= 3 or self.df.value < "2") +self.assertEqual(99, self.df.filter(~(self.df.key == 1)).count()) +self.assertRaises(ValueError, lambda: not self.df.key == 1) + def test_udf_with_callable(self): d = [Row(number=i, squared=i**2) for i in range(10)] rdd = self.sc.parallelize(d) @@ -393,7 +401,7 @@ class SQLTests(ReusedPySparkTestCase): self.assertTrue(isinstance((- ci - 1 - 2) % 3 * 2.5 / 3.5, Column)) rcc = (1 + ci), (1 - ci), (1 * ci), (1 / ci), (1 % ci) self.assertTrue(all(isinstance(c, Column) for c in rcc)) -cb = [ci == 5, ci != 0, ci < 3, ci > 4, ci >= 0, ci <= 7, ci and cs, ci or cs] +cb = [ci == 5, ci != 0, ci < 3, ci > 4, ci >= 0, ci <= 7] self.assertTrue(all(isinstance(c, Column) for c in cb)) cbool = (ci & ci), (ci | ci), (~ci) self.assertTrue(all(isinstance(c, Column) for c in cbool))
spark git commit: [SPARK-8139] [SQL] Updates docs and comments of data sources and Parquet output committer options
Repository: spark Updated Branches: refs/heads/master 7fb5ae502 -> 111d6b9b8

[SPARK-8139] [SQL] Updates docs and comments of data sources and Parquet output committer options

This PR only applies to master branch (1.5.0-SNAPSHOT) since it references `org.apache.parquet` classes which only appear in Parquet 1.7.0.

Author: Cheng Lian l...@databricks.com

Closes #6683 from liancheng/output-committer-docs and squashes the following commits: b4648b8 [Cheng Lian] Removes spark.sql.sources.outputCommitterClass as it's not a public option ee63923 [Cheng Lian] Updates docs and comments of data sources and Parquet output committer options

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/111d6b9b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/111d6b9b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/111d6b9b Branch: refs/heads/master Commit: 111d6b9b8a584b962b6ae80c7aa8c45845ce0099 Parents: 7fb5ae5 Author: Cheng Lian l...@databricks.com Authored: Tue Jun 23 17:24:26 2015 -0700 Committer: Cheng Lian l...@databricks.com Committed: Tue Jun 23 17:24:26 2015 -0700

-- docs/sql-programming-guide.md | 30 - .../scala/org/apache/spark/sql/SQLConf.scala | 30 + .../parquet/DirectParquetOutputCommitter.scala | 34 ++-- .../apache/spark/sql/parquet/newParquet.scala | 4 +-- 4 files changed, 78 insertions(+), 20 deletions(-) --

http://git-wip-us.apache.org/repos/asf/spark/blob/111d6b9b/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 9107c9b..2786e3d 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1348,6 +1348,34 @@ Configuration of Parquet can be done using the `setConf` method on `SQLContext` support. </td> </tr> +<tr> + <td><code>spark.sql.parquet.output.committer.class</code></td> + <td><code>org.apache.parquet.hadoop.<br />ParquetOutputCommitter</code></td> + <td> +<p> + The output committer class used by Parquet. The specified class needs to be a subclass of + <code>org.apache.hadoop.<br />mapreduce.OutputCommitter</code>. Typically, it's also a + subclass of <code>org.apache.parquet.hadoop.ParquetOutputCommitter</code>. +</p> +<p> + <b>Note:</b> + <ul> +<li> + This option must be set via Hadoop <code>Configuration</code> rather than Spark + <code>SQLConf</code>. +</li> +<li> + This option overrides <code>spark.sql.sources.<br />outputCommitterClass</code>. +</li> + </ul> +</p> +<p> + Spark SQL comes with a builtin + <code>org.apache.spark.sql.<br />parquet.DirectParquetOutputCommitter</code>, which can be more + efficient then the default Parquet output committer when writing data to S3. +</p> + </td> +</tr> </table> ## JSON Datasets @@ -1876,7 +1904,7 @@ that these options will be deprecated in future release as more optimizations ar Configures the number of partitions to use when shuffling data for joins or aggregations. </td> </tr> - <tr> + <tr> <td><code>spark.sql.planner.externalSort</code></td> <td>false</td> <td>

http://git-wip-us.apache.org/repos/asf/spark/blob/111d6b9b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala index 16493c3..2653526 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala @@ -22,6 +22,8 @@ import java.util.Properties import scala.collection.immutable import scala.collection.JavaConversions._ +import org.apache.parquet.hadoop.ParquetOutputCommitter + import org.apache.spark.sql.catalyst.CatalystConf private[spark] object SQLConf { @@ -252,9 +254,9 @@ private[spark] object SQLConf { val PARQUET_FILTER_PUSHDOWN_ENABLED = booleanConf("spark.sql.parquet.filterPushdown", defaultValue = Some(false), -doc = "Turn on Parquet filter pushdown optimization. This feature is turned off by default " + - "because of a known bug in Paruet 1.6.0rc3 " + - "(<a href=\"https://issues.apache.org/jira/browse/PARQUET-136\">PARQUET-136</a>). However, " + +doc = "Turn on Parquet filter pushdown optimization. This feature is turned off by default " + + "because of a known bug in Parquet 1.6.0rc3 " + + "(PARQUET-136, https://issues.apache.org/jira/browse/PARQUET-136). However, " + "if your table doesn't contain any nullable string or binary columns, it's still safe to " + "turn this feature on.") @@ -262,11 +264,21 @@ private[spark]
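Per the note added above, the committer class must go into the Hadoop Configuration, not SQLConf. A sketch of wiring it up (assuming an existing sqlContext and df; the output path is illustrative):

```scala
// Sketch: select the direct committer documented above. The key lives in the
// Hadoop Configuration; Spark's SQLConf does not read it.
sqlContext.sparkContext.hadoopConfiguration.set(
  "spark.sql.parquet.output.committer.class",
  "org.apache.spark.sql.parquet.DirectParquetOutputCommitter")

// Subsequent Parquet writes pick the committer up from the job configuration.
df.write.parquet("s3n://bucket/warehouse/table") // illustrative destination
```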
spark git commit: [SPARK-8573] [SPARK-8568] [SQL] [PYSPARK] raise Exception if column is used in booelan expression
Repository: spark Updated Branches: refs/heads/master d96d7b557 -> 7fb5ae502

[SPARK-8573] [SPARK-8568] [SQL] [PYSPARK] raise Exception if column is used in booelan expression

It's a common mistake that a user will put a Column in a boolean expression (together with `and`, `or`), which does not work as expected; we should raise an exception in that case and suggest the user use `&`, `|` instead.

Author: Davies Liu dav...@databricks.com

Closes #6961 from davies/column_bool and squashes the following commits: 9f19beb [Davies Liu] update message af74bd6 [Davies Liu] fix tests 07dff84 [Davies Liu] address comments, fix tests f70c08e [Davies Liu] raise Exception if column is used in booelan expression

Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7fb5ae50 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7fb5ae50 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7fb5ae50 Branch: refs/heads/master Commit: 7fb5ae5024284593204779ff463bfbdb4d1c6da5 Parents: d96d7b5 Author: Davies Liu dav...@databricks.com Authored: Tue Jun 23 15:51:16 2015 -0700 Committer: Davies Liu dav...@databricks.com Committed: Tue Jun 23 15:51:16 2015 -0700

-- python/pyspark/sql/column.py | 5 + python/pyspark/sql/tests.py | 10 +- 2 files changed, 14 insertions(+), 1 deletion(-) --

http://git-wip-us.apache.org/repos/asf/spark/blob/7fb5ae50/python/pyspark/sql/column.py -- diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py index 1ecec5b..0a85da7 100644 --- a/python/pyspark/sql/column.py +++ b/python/pyspark/sql/column.py @@ -396,6 +396,11 @@ class Column(object): jc = self._jc.over(window._jspec) return Column(jc) +def __nonzero__(self): +raise ValueError("Cannot convert column into bool: please use '&' for 'and', '|' for 'or', " + "'~' for 'not' when building DataFrame boolean expressions.") +__bool__ = __nonzero__ + def __repr__(self): return 'Column<%s>' % self._jc.toString().encode('utf8')

http://git-wip-us.apache.org/repos/asf/spark/blob/7fb5ae50/python/pyspark/sql/tests.py -- diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 13f4556..e6a434e 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -164,6 +164,14 @@ class SQLTests(ReusedPySparkTestCase): self.assertEqual(result[0][0], a) self.assertEqual(result[0][1], b) +def test_and_in_expression(self): +self.assertEqual(4, self.df.filter((self.df.key <= 10) & (self.df.value <= "2")).count()) +self.assertRaises(ValueError, lambda: (self.df.key <= 10) and (self.df.value <= "2")) +self.assertEqual(14, self.df.filter((self.df.key <= 3) | (self.df.value < "2")).count()) +self.assertRaises(ValueError, lambda: self.df.key <= 3 or self.df.value < "2") +self.assertEqual(99, self.df.filter(~(self.df.key == 1)).count()) +self.assertRaises(ValueError, lambda: not self.df.key == 1) + def test_udf_with_callable(self): d = [Row(number=i, squared=i**2) for i in range(10)] rdd = self.sc.parallelize(d) @@ -408,7 +416,7 @@ class SQLTests(ReusedPySparkTestCase): self.assertTrue(isinstance((- ci - 1 - 2) % 3 * 2.5 / 3.5, Column)) rcc = (1 + ci), (1 - ci), (1 * ci), (1 / ci), (1 % ci) self.assertTrue(all(isinstance(c, Column) for c in rcc)) -cb = [ci == 5, ci != 0, ci < 3, ci > 4, ci >= 0, ci <= 7, ci and cs, ci or cs] +cb = [ci == 5, ci != 0, ci < 3, ci > 4, ci >= 0, ci <= 7] self.assertTrue(all(isinstance(c, Column) for c in cb)) cbool = (ci & ci), (ci | ci), (~ci) self.assertTrue(all(isinstance(c, Column) for c in cbool))