spark git commit: [SPARK-7235] [SQL] Refactor the grouping sets

2015-06-23 Thread marmbrus
Repository: spark
Updated Branches:
  refs/heads/master 4f7fbefb8 -> 7b1450b66


[SPARK-7235] [SQL] Refactor the grouping sets

The logical plan `Expand` takes the `output` as a constructor argument, which 
breaks the reference chain. We need to refactor the code, as well as the column 
pruning.
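
As background, a minimal, self-contained Scala sketch (not the patch itself; every name
other than groupByExprs/bitmask is made up) of how a grouping-set bitmask decides which
group-by expressions survive in each expanded row:

    // Illustrative only: expression i is kept for this grouping set iff bit i of the
    // mask is set; a cleared bit means that slot is nulled out (Literal(null) in the plan).
    def expandOnce[T](groupByExprs: Seq[T], bitmask: Int): Seq[Option[T]] =
      groupByExprs.zipWithIndex.map { case (expr, i) =>
        if (((bitmask >> i) & 1) == 1) Some(expr) else None // None stands in for null
      }

    // GROUP BY a, b WITH CUBE enumerates bitmasks 0 to 3 (Seq.tabulate(1 << 2)(i => i)):
    // expandOnce(Seq("a", "b"), 3) == Seq(Some("a"), Some("b"))
    // expandOnce(Seq("a", "b"), 1) == Seq(Some("a"), None)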

Author: Cheng Hao hao.ch...@intel.com

Closes #5780 from chenghao-intel/expand and squashes the following commits:

76e4aa4 [Cheng Hao] revert the change for case insenstive
7c10a83 [Cheng Hao] refactor the grouping sets


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7b1450b6
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7b1450b6
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7b1450b6

Branch: refs/heads/master
Commit: 7b1450b666f88452e7fe969a6d59e8b24842ea39
Parents: 4f7fbef
Author: Cheng Hao hao.ch...@intel.com
Authored: Tue Jun 23 10:52:17 2015 -0700
Committer: Michael Armbrust mich...@databricks.com
Committed: Tue Jun 23 10:52:17 2015 -0700

--
 .../spark/sql/catalyst/analysis/Analyzer.scala  | 55 +++--
 .../catalyst/expressions/namedExpressions.scala |  2 +-
 .../sql/catalyst/optimizer/Optimizer.scala  |  4 +
 .../catalyst/plans/logical/basicOperators.scala | 84 +++-
 .../spark/sql/execution/SparkStrategies.scala   |  4 +-
 5 files changed, 78 insertions(+), 71 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/7b1450b6/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index 6311784..0a3f5a7 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -192,49 +192,17 @@ class Analyzer(
       Seq.tabulate(1 << c.groupByExprs.length)(i => i)
     }
 
-    /**
-     * Create an array of Projections for the child projection, and replace the projections'
-     * expressions which equal GroupBy expressions with Literal(null), if those expressions
-     * are not set for this grouping set (according to the bit mask).
-     */
-    private[this] def expand(g: GroupingSets): Seq[Seq[Expression]] = {
-      val result = new scala.collection.mutable.ArrayBuffer[Seq[Expression]]
-
-      g.bitmasks.foreach { bitmask =>
-        // get the non selected grouping attributes according to the bit mask
-        val nonSelectedGroupExprs = ArrayBuffer.empty[Expression]
-        var bit = g.groupByExprs.length - 1
-        while (bit >= 0) {
-          if (((bitmask >> bit) & 1) == 0) nonSelectedGroupExprs += g.groupByExprs(bit)
-          bit -= 1
-        }
-
-        val substitution = (g.child.output :+ g.gid).map(expr => expr transformDown {
-          case x: Expression if nonSelectedGroupExprs.find(_ semanticEquals x).isDefined =>
-            // if the input attribute is in the Invalid Grouping Expression set for this group
-            // replace it with constant null
-            Literal.create(null, expr.dataType)
-          case x if x == g.gid =>
-            // replace the groupingId with concrete value (the bit mask)
-            Literal.create(bitmask, IntegerType)
-        })
-
-        result += substitution
-      }
-
-      result.toSeq
-    }
-
     def apply(plan: LogicalPlan): LogicalPlan = plan transform {
-      case a: Cube if a.resolved =>
-        GroupingSets(bitmasks(a), a.groupByExprs, a.child, a.aggregations, a.gid)
-      case a: Rollup if a.resolved =>
-        GroupingSets(bitmasks(a), a.groupByExprs, a.child, a.aggregations, a.gid)
-      case x: GroupingSets if x.resolved =>
+      case a: Cube =>
+        GroupingSets(bitmasks(a), a.groupByExprs, a.child, a.aggregations)
+      case a: Rollup =>
+        GroupingSets(bitmasks(a), a.groupByExprs, a.child, a.aggregations)
+      case x: GroupingSets =>
+        val gid = AttributeReference(VirtualColumn.groupingIdName, IntegerType, false)()
         Aggregate(
-          x.groupByExprs :+ x.gid,
+          x.groupByExprs :+ VirtualColumn.groupingIdAttribute,
           x.aggregations,
-          Expand(expand(x), x.child.output :+ x.gid, x.child))
+          Expand(x.bitmasks, x.groupByExprs, gid, x.child))
     }
   }
 
@@ -368,12 +336,7 @@ class Analyzer(
 
       case q: LogicalPlan =>
         logTrace(s"Attempting to resolve ${q.simpleString}")
-        q transformExpressionsUp {
-          case u @ UnresolvedAttribute(nameParts) if nameParts.length == 1 &&
-            resolver(nameParts(0), VirtualColumn.groupingIdName) &&
-            q.isInstanceOf[GroupingAnalytics] =>
-

spark git commit: [SPARK-8498] [TUNGSTEN] fix npe in errorhandling path in unsafeshuffle writer

2015-06-23 Thread joshrosen
Repository: spark
Updated Branches:
  refs/heads/branch-1.4 929479675 -> 334824505


[SPARK-8498] [TUNGSTEN] fix npe in errorhandling path in unsafeshuffle writer

Author: Holden Karau hol...@pigscanfly.ca

Closes #6918 from 
holdenk/SPARK-8498-fix-npe-in-errorhandling-path-in-unsafeshuffle-writer and 
squashes the following commits:

f807832 [Holden Karau] Log error if we can't throw it
855f9aa [Holden Karau] Spelling - not my strongest suite. Fix Propegates to 
Propagates.
039d620 [Holden Karau] Add missing closeandwriteoutput
30e558d [Holden Karau] go back to try/finally
e503b8c [Holden Karau] Improve the test to ensure we aren't masking the 
underlying exception
ae0b7a7 [Holden Karau] Fix the test
2e6abf7 [Holden Karau] Be more cautious when cleaning up during failed write 
and re-throw user exceptions

(cherry picked from commit 0f92be5b5f017b593bd29d4da7e89aad2b3adac2)
Signed-off-by: Josh Rosen joshro...@databricks.com


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/33482450
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/33482450
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/33482450

Branch: refs/heads/branch-1.4
Commit: 33482450559028b907a0473277297cc54429322e
Parents: 9294796
Author: Holden Karau hol...@pigscanfly.ca
Authored: Tue Jun 23 09:08:11 2015 -0700
Committer: Josh Rosen joshro...@databricks.com
Committed: Tue Jun 23 09:08:49 2015 -0700

--
 .../spark/shuffle/unsafe/UnsafeShuffleWriter.java | 18 --
 .../shuffle/unsafe/UnsafeShuffleWriterSuite.java  | 17 +
 2 files changed, 33 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/33482450/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java
--
diff --git 
a/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java 
b/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java
index ad7eb04..764578b 100644
--- 
a/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java
+++ 
b/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java
@@ -139,6 +139,9 @@ public class UnsafeShuffleWriter<K, V> extends ShuffleWriter<K, V> {
 
   @Override
   public void write(scala.collection.Iterator<Product2<K, V>> records) throws IOException {
+    // Keep track of success so we know if we ecountered an exception
+    // We do this rather than a standard try/catch/re-throw to handle
+    // generic throwables.
     boolean success = false;
     try {
       while (records.hasNext()) {
@@ -147,8 +150,19 @@ public class UnsafeShuffleWriter<K, V> extends ShuffleWriter<K, V> {
       closeAndWriteOutput();
       success = true;
     } finally {
-      if (!success) {
-        sorter.cleanupAfterError();
+      if (sorter != null) {
+        try {
+          sorter.cleanupAfterError();
+        } catch (Exception e) {
+          // Only throw this error if we won't be masking another
+          // error.
+          if (success) {
+            throw e;
+          } else {
+            logger.error("In addition to a failure during writing, we failed during " +
+              "cleanup.", e);
+          }
+        }
       }
     }
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/33482450/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
--
diff --git 
a/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
 
b/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
index 83d1091..10c3eed 100644
--- 
a/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
+++ 
b/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
@@ -253,6 +253,23 @@ public class UnsafeShuffleWriterSuite {
     createWriter(false).stop(false);
   }
 
+  class PandaException extends RuntimeException {
+  }
+
+  @Test(expected=PandaException.class)
+  public void writeFailurePropagates() throws Exception {
+    class BadRecords extends scala.collection.AbstractIterator<Product2<Object, Object>> {
+      @Override public boolean hasNext() {
+        throw new PandaException();
+      }
+      @Override public Product2<Object, Object> next() {
+        return null;
+      }
+    }
+    final UnsafeShuffleWriter<Object, Object> writer = createWriter(true);
+    writer.write(new BadRecords());
+  }
+
   @Test
   public void writeEmptyIterator() throws Exception {
     final UnsafeShuffleWriter<Object, Object> writer = createWriter(true);
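
For readers skimming the patch above, a minimal Scala sketch of the same cleanup pattern --
run cleanup in a finally block, but only let a cleanup failure propagate when it would not
mask the original write error (the function and names here are illustrative, not Spark APIs):

    def writeAll(records: Iterator[Array[Byte]], cleanup: () => Unit): Unit = {
      var success = false
      try {
        records.foreach(_ => ()) // stand-in for the real write loop
        success = true
      } finally {
        try {
          cleanup()
        } catch {
          case e: Exception =>
            if (success) throw e // nothing to mask, safe to propagate
            else System.err.println(s"Cleanup also failed after a write error: $e") // keep the original error visible
        }
      }
    }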


spark git commit: Revert [SPARK-8498] [TUNGSTEN] fix npe in errorhandling path in unsafeshuffle writer

2015-06-23 Thread joshrosen
Repository: spark
Updated Branches:
  refs/heads/branch-1.4 334824505 -> 77cb1d5ed


Revert [SPARK-8498] [TUNGSTEN] fix npe in errorhandling path in unsafeshuffle 
writer

This reverts commit 33482450559028b907a0473277297cc54429322e.

Reverting because `catch (Exception e) ... throw e` doesn't compile under
Java 6 unless the method declares that it throws Exception. (Java 7 and later
accept this pattern thanks to more precise rethrow analysis, but these branches
still build with Java 6.)


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/77cb1d5e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/77cb1d5e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/77cb1d5e

Branch: refs/heads/branch-1.4
Commit: 77cb1d5ed1d0343b512856e24d9c14556236364b
Parents: 3348245
Author: Josh Rosen joshro...@databricks.com
Authored: Tue Jun 23 09:19:11 2015 -0700
Committer: Josh Rosen joshro...@databricks.com
Committed: Tue Jun 23 09:19:11 2015 -0700

--
 .../spark/shuffle/unsafe/UnsafeShuffleWriter.java | 18 ++
 .../shuffle/unsafe/UnsafeShuffleWriterSuite.java  | 17 -
 2 files changed, 2 insertions(+), 33 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/77cb1d5e/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java
--
diff --git 
a/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java 
b/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java
index 764578b..ad7eb04 100644
--- 
a/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java
+++ 
b/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java
@@ -139,9 +139,6 @@ public class UnsafeShuffleWriter<K, V> extends ShuffleWriter<K, V> {
 
   @Override
   public void write(scala.collection.Iterator<Product2<K, V>> records) throws IOException {
-    // Keep track of success so we know if we ecountered an exception
-    // We do this rather than a standard try/catch/re-throw to handle
-    // generic throwables.
     boolean success = false;
     try {
       while (records.hasNext()) {
@@ -150,19 +147,8 @@ public class UnsafeShuffleWriter<K, V> extends ShuffleWriter<K, V> {
       closeAndWriteOutput();
       success = true;
     } finally {
-      if (sorter != null) {
-        try {
-          sorter.cleanupAfterError();
-        } catch (Exception e) {
-          // Only throw this error if we won't be masking another
-          // error.
-          if (success) {
-            throw e;
-          } else {
-            logger.error("In addition to a failure during writing, we failed during " +
-              "cleanup.", e);
-          }
-        }
+      if (!success) {
+        sorter.cleanupAfterError();
       }
     }
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/77cb1d5e/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
--
diff --git 
a/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
 
b/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
index 10c3eed..83d1091 100644
--- 
a/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
+++ 
b/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
@@ -253,23 +253,6 @@ public class UnsafeShuffleWriterSuite {
     createWriter(false).stop(false);
   }
 
-  class PandaException extends RuntimeException {
-  }
-
-  @Test(expected=PandaException.class)
-  public void writeFailurePropagates() throws Exception {
-    class BadRecords extends scala.collection.AbstractIterator<Product2<Object, Object>> {
-      @Override public boolean hasNext() {
-        throw new PandaException();
-      }
-      @Override public Product2<Object, Object> next() {
-        return null;
-      }
-    }
-    final UnsafeShuffleWriter<Object, Object> writer = createWriter(true);
-    writer.write(new BadRecords());
-  }
-
   @Test
   public void writeEmptyIterator() throws Exception {
     final UnsafeShuffleWriter<Object, Object> writer = createWriter(true);





spark git commit: [SQL] [DOCS] updated the documentation for explode

2015-06-23 Thread sarutak
Repository: spark
Updated Branches:
  refs/heads/branch-1.4 77cb1d5ed -> 27693e175


[SQL] [DOCS] updated the documentation for explode

the syntax was incorrect in the example in explode

Author: lockwobr lockw...@gmail.com

Closes #6943 from lockwobr/master and squashes the following commits:

3d864d1 [lockwobr] updated the documentation for explode

(cherry picked from commit 4f7fbefb8db56ecaab66bb0ac2ab124416fefe58)
Signed-off-by: Kousuke Saruta saru...@oss.nttdata.co.jp


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/27693e17
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/27693e17
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/27693e17

Branch: refs/heads/branch-1.4
Commit: 27693e175715a7d6901ec79ab510b883a4cb0fb2
Parents: 77cb1d5
Author: lockwobr lockw...@gmail.com
Authored: Wed Jun 24 02:48:56 2015 +0900
Committer: Kousuke Saruta saru...@oss.nttdata.co.jp
Committed: Wed Jun 24 02:51:36 2015 +0900

--
 sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/27693e17/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index 2a01824..29bba18 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -999,7 +999,7 @@ class DataFrame private[sql](
* columns of the input row are implicitly joined with each value that is 
output by the function.
*
* {{{
-   *   df.explode("words", "word")(words: String => words.split(" "))
+   *   df.explode("words", "word"){words: String => words.split(" ")}
* }}}
* @group dfops
* @since 1.3.0
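
A slightly fuller, hypothetical use of the corrected example (the sqlContext, the case
class, and the data below are assumptions for illustration, not part of the patch):

    import sqlContext.implicits._

    case class Record(words: String)
    val df = Seq(Record("hello spark"), Record("explode docs")).toDF()

    // Each input row is joined with every element returned by the function.
    val exploded = df.explode("words", "word") { words: String => words.split(" ") }
    exploded.show()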





spark git commit: [SPARK-8498] [TUNGSTEN] fix npe in errorhandling path in unsafeshuffle writer

2015-06-23 Thread joshrosen
Repository: spark
Updated Branches:
  refs/heads/master 6ceb16960 -> 0f92be5b5


[SPARK-8498] [TUNGSTEN] fix npe in errorhandling path in unsafeshuffle writer

Author: Holden Karau hol...@pigscanfly.ca

Closes #6918 from 
holdenk/SPARK-8498-fix-npe-in-errorhandling-path-in-unsafeshuffle-writer and 
squashes the following commits:

f807832 [Holden Karau] Log error if we can't throw it
855f9aa [Holden Karau] Spelling - not my strongest suite. Fix Propegates to 
Propagates.
039d620 [Holden Karau] Add missing closeandwriteoutput
30e558d [Holden Karau] go back to try/finally
e503b8c [Holden Karau] Improve the test to ensure we aren't masking the 
underlying exception
ae0b7a7 [Holden Karau] Fix the test
2e6abf7 [Holden Karau] Be more cautious when cleaning up during failed write 
and re-throw user exceptions


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0f92be5b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0f92be5b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0f92be5b

Branch: refs/heads/master
Commit: 0f92be5b5f017b593bd29d4da7e89aad2b3adac2
Parents: 6ceb169
Author: Holden Karau hol...@pigscanfly.ca
Authored: Tue Jun 23 09:08:11 2015 -0700
Committer: Josh Rosen joshro...@databricks.com
Committed: Tue Jun 23 09:08:11 2015 -0700

--
 .../spark/shuffle/unsafe/UnsafeShuffleWriter.java | 18 --
 .../shuffle/unsafe/UnsafeShuffleWriterSuite.java  | 17 +
 2 files changed, 33 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/0f92be5b/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java
--
diff --git 
a/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java 
b/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java
index ad7eb04..764578b 100644
--- 
a/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java
+++ 
b/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java
@@ -139,6 +139,9 @@ public class UnsafeShuffleWriter<K, V> extends ShuffleWriter<K, V> {
 
   @Override
   public void write(scala.collection.Iterator<Product2<K, V>> records) throws IOException {
+    // Keep track of success so we know if we ecountered an exception
+    // We do this rather than a standard try/catch/re-throw to handle
+    // generic throwables.
     boolean success = false;
     try {
       while (records.hasNext()) {
@@ -147,8 +150,19 @@ public class UnsafeShuffleWriter<K, V> extends ShuffleWriter<K, V> {
       closeAndWriteOutput();
       success = true;
     } finally {
-      if (!success) {
-        sorter.cleanupAfterError();
+      if (sorter != null) {
+        try {
+          sorter.cleanupAfterError();
+        } catch (Exception e) {
+          // Only throw this error if we won't be masking another
+          // error.
+          if (success) {
+            throw e;
+          } else {
+            logger.error("In addition to a failure during writing, we failed during " +
+              "cleanup.", e);
+          }
+        }
       }
     }
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/0f92be5b/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
--
diff --git 
a/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
 
b/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
index 83d1091..10c3eed 100644
--- 
a/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
+++ 
b/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
@@ -253,6 +253,23 @@ public class UnsafeShuffleWriterSuite {
     createWriter(false).stop(false);
   }
 
+  class PandaException extends RuntimeException {
+  }
+
+  @Test(expected=PandaException.class)
+  public void writeFailurePropagates() throws Exception {
+    class BadRecords extends scala.collection.AbstractIterator<Product2<Object, Object>> {
+      @Override public boolean hasNext() {
+        throw new PandaException();
+      }
+      @Override public Product2<Object, Object> next() {
+        return null;
+      }
+    }
+    final UnsafeShuffleWriter<Object, Object> writer = createWriter(true);
+    writer.write(new BadRecords());
+  }
+
   @Test
   public void writeEmptyIterator() throws Exception {
     final UnsafeShuffleWriter<Object, Object> writer = createWriter(true);





spark git commit: [SQL] [DOCS] updated the documentation for explode

2015-06-23 Thread sarutak
Repository: spark
Updated Branches:
  refs/heads/master 0f92be5b5 -> 4f7fbefb8


[SQL] [DOCS] updated the documentation for explode

the syntax was incorrect in the example in explode

Author: lockwobr lockw...@gmail.com

Closes #6943 from lockwobr/master and squashes the following commits:

3d864d1 [lockwobr] updated the documentation for explode


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4f7fbefb
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4f7fbefb
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4f7fbefb

Branch: refs/heads/master
Commit: 4f7fbefb8db56ecaab66bb0ac2ab124416fefe58
Parents: 0f92be5
Author: lockwobr lockw...@gmail.com
Authored: Wed Jun 24 02:48:56 2015 +0900
Committer: Kousuke Saruta saru...@oss.nttdata.co.jp
Committed: Wed Jun 24 02:48:56 2015 +0900

--
 sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/4f7fbefb/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index 492a332..f3f0f53 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -1049,7 +1049,7 @@ class DataFrame private[sql](
* columns of the input row are implicitly joined with each value that is 
output by the function.
*
* {{{
-   *   df.explode("words", "word")(words: String => words.split(" "))
+   *   df.explode("words", "word"){words: String => words.split(" ")}
* }}}
* @group dfops
* @since 1.3.0





spark git commit: [SPARK-8265] [MLLIB] [PYSPARK] Add LinearDataGenerator to pyspark.mllib.utils

2015-06-23 Thread meng
Repository: spark
Updated Branches:
  refs/heads/master 2bdd0 -> f2022fa0d


[SPARK-8265] [MLLIB] [PYSPARK] Add LinearDataGenerator to pyspark.mllib.utils

It is useful to generate linear data for easy testing of linear models and in 
general. Scala already has it. This is just a wrapper around the Scala code.
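
As a rough illustration of what the wrapped generator produces, a Scala sketch that calls
generateLinearInput directly, following the argument order of the wrapper visible in the
diff below (the concrete values are made up):

    import org.apache.spark.mllib.util.LinearDataGenerator

    // 100 labeled points around y = 0.5 + 2.0*x1 - 1.5*x2 plus Gaussian noise (eps).
    val points = LinearDataGenerator.generateLinearInput(
      0.5,               // intercept
      Array(2.0, -1.5),  // weights
      Array(0.0, 0.0),   // xMean
      Array(1.0, 1.0),   // xVariance
      100,               // nPoints
      42,                // seed
      0.1)               // eps
    points.take(3).foreach(println) // each element is a LabeledPoint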

Author: MechCoder manojkumarsivaraj...@gmail.com

Closes #6715 from MechCoder/generate_linear_input and squashes the following 
commits:

6182884 [MechCoder] Minor changes
8bda047 [MechCoder] Minor style fixes
0f1053c [MechCoder] [SPARK-8265] Add LinearDataGenerator to pyspark.mllib.utils


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f2022fa0
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f2022fa0
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f2022fa0

Branch: refs/heads/master
Commit: f2022fa0d375c804eca7803e172543b23ecbb9b7
Parents: 2bd
Author: MechCoder manojkumarsivaraj...@gmail.com
Authored: Tue Jun 23 12:43:32 2015 -0700
Committer: Xiangrui Meng m...@databricks.com
Committed: Tue Jun 23 12:43:32 2015 -0700

--
 .../spark/mllib/api/python/PythonMLLibAPI.scala | 32 +-
 python/pyspark/mllib/tests.py   | 22 ++--
 python/pyspark/mllib/util.py| 35 
 3 files changed, 86 insertions(+), 3 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/f2022fa0/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index f9a271f..c4bea7c 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -51,6 +51,7 @@ import org.apache.spark.mllib.tree.loss.Losses
 import org.apache.spark.mllib.tree.model.{DecisionTreeModel, 
GradientBoostedTreesModel, RandomForestModel}
 import org.apache.spark.mllib.tree.{DecisionTree, GradientBoostedTrees, 
RandomForest}
 import org.apache.spark.mllib.util.MLUtils
+import org.apache.spark.mllib.util.LinearDataGenerator
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.DataFrame
 import org.apache.spark.storage.StorageLevel
@@ -972,7 +973,7 @@ private[python] class PythonMLLibAPI extends Serializable {
   def estimateKernelDensity(
   sample: JavaRDD[Double],
   bandwidth: Double, points: java.util.ArrayList[Double]): Array[Double] = 
{
-return new 
KernelDensity().setSample(sample).setBandwidth(bandwidth).estimate(
+new KernelDensity().setSample(sample).setBandwidth(bandwidth).estimate(
   points.asScala.toArray)
   }
 
@@ -991,6 +992,35 @@ private[python] class PythonMLLibAPI extends Serializable {
   List[AnyRef](model.clusterCenters, 
Vectors.dense(model.clusterWeights)).asJava
   }
 
+  /**
+   * Wrapper around the generateLinearInput method of LinearDataGenerator.
+   */
+  def generateLinearInputWrapper(
+  intercept: Double,
+  weights: JList[Double],
+  xMean: JList[Double],
+  xVariance: JList[Double],
+  nPoints: Int,
+  seed: Int,
+  eps: Double): Array[LabeledPoint] = {
+LinearDataGenerator.generateLinearInput(
+  intercept, weights.asScala.toArray, xMean.asScala.toArray,
+  xVariance.asScala.toArray, nPoints, seed, eps).toArray
+  }
+
+  /**
+   * Wrapper around the generateLinearRDD method of LinearDataGenerator.
+   */
+  def generateLinearRDDWrapper(
+  sc: JavaSparkContext,
+  nexamples: Int,
+  nfeatures: Int,
+  eps: Double,
+  nparts: Int,
+  intercept: Double): JavaRDD[LabeledPoint] = {
+LinearDataGenerator.generateLinearRDD(
+  sc, nexamples, nfeatures, eps, nparts, intercept)
+  }
 }
 
 /**

http://git-wip-us.apache.org/repos/asf/spark/blob/f2022fa0/python/pyspark/mllib/tests.py
--
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index c8d61b9..509faa1 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -49,8 +49,8 @@ from pyspark.mllib.random import RandomRDDs
 from pyspark.mllib.stat import Statistics
 from pyspark.mllib.feature import Word2Vec
 from pyspark.mllib.feature import IDF
-from pyspark.mllib.feature import StandardScaler
-from pyspark.mllib.feature import ElementwiseProduct
+from pyspark.mllib.feature import StandardScaler, ElementwiseProduct
+from pyspark.mllib.util import LinearDataGenerator
 from pyspark.serializers import PickleSerializer
 from pyspark.streaming import StreamingContext
 from pyspark.sql import SQLContext

spark git commit: [SPARK-7888] Be able to disable intercept in linear regression in ml package

2015-06-23 Thread dbtsai
Repository: spark
Updated Branches:
  refs/heads/master 6f4cadf5e -> 2bdd0


[SPARK-7888] Be able to disable intercept in linear regression in ml package

Author: Holden Karau hol...@pigscanfly.ca

Closes #6927 from 
holdenk/SPARK-7888-Be-able-to-disable-intercept-in-Linear-Regression-in-ML-package
 and squashes the following commits:

0ad384c [Holden Karau] Add MiMa excludes
4016fac [Holden Karau] Switch to wild card import, remove extra blank lines
ae5baa8 [Holden Karau] CR feedback, move the fitIntercept down rather than 
changing ymean and etc above
f34971c [Holden Karau] Fix some more long lines
319bd3f [Holden Karau] Fix long lines
3bb9ee1 [Holden Karau] Update the regression suite tests
7015b9f [Holden Karau] Our code performs the same with R, except we need more 
than one data point but that seems reasonable
0b0c8c0 [Holden Karau] fix the issue with the sample R code
e2140ba [Holden Karau] Add a test, it fails!
5e84a0b [Holden Karau] Write out thoughts and use the correct trait
91ffc0a [Holden Karau] more murh
006246c [Holden Karau] murp?
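
A minimal sketch of how the new flag is used from the ML Pipelines API (the sqlContext and
training data below are assumptions; only setFitIntercept comes from this patch):

    import org.apache.spark.ml.regression.LinearRegression
    import org.apache.spark.mllib.linalg.Vectors

    // Hypothetical training data with label = 2 * x and no intercept.
    val training = sqlContext.createDataFrame(Seq(
      (2.0, Vectors.dense(1.0)),
      (4.0, Vectors.dense(2.0)),
      (6.0, Vectors.dense(3.0))
    )).toDF("label", "features")

    // Fit through the origin by disabling the intercept term (defaults to true).
    val lr = new LinearRegression().setMaxIter(50).setFitIntercept(false)
    val model = lr.fit(training)
    println(s"weights: ${model.weights}, intercept: ${model.intercept}")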


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2bdd
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2bdd
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2bdd

Branch: refs/heads/master
Commit: 2bdd0b8deb9ad8d43fec792e60e3d0c4de75
Parents: 6f4cadf
Author: Holden Karau hol...@pigscanfly.ca
Authored: Tue Jun 23 12:42:17 2015 -0700
Committer: DB Tsai d...@netflix.com
Committed: Tue Jun 23 12:42:17 2015 -0700

--
 .../spark/ml/regression/LinearRegression.scala  |  30 +++-
 .../ml/regression/LinearRegressionSuite.scala   | 149 ++-
 project/MimaExcludes.scala  |   5 +
 3 files changed, 172 insertions(+), 12 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/2bdd/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala 
b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 0130654..1b1d729 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -26,7 +26,7 @@ import org.apache.spark.Logging
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.PredictorParams
 import org.apache.spark.ml.param.ParamMap
-import org.apache.spark.ml.param.shared.{HasElasticNetParam, HasMaxIter, 
HasRegParam, HasTol}
+import org.apache.spark.ml.param.shared._
 import org.apache.spark.ml.util.Identifiable
 import org.apache.spark.mllib.linalg.{Vector, Vectors}
 import org.apache.spark.mllib.linalg.BLAS._
@@ -41,7 +41,8 @@ import org.apache.spark.util.StatCounter
  * Params for linear regression.
  */
 private[regression] trait LinearRegressionParams extends PredictorParams
-  with HasRegParam with HasElasticNetParam with HasMaxIter with HasTol
+with HasRegParam with HasElasticNetParam with HasMaxIter with HasTol
+with HasFitIntercept
 
 /**
  * :: Experimental ::
@@ -73,6 +74,14 @@ class LinearRegression(override val uid: String)
   setDefault(regParam -> 0.0)
 
   /**
+   * Set if we should fit the intercept
+   * Default is true.
+   * @group setParam
+   */
+  def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value)
+  setDefault(fitIntercept -> true)
+
+  /**
* Set the ElasticNet mixing parameter.
* For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 
penalty.
* For 0 < alpha < 1, the penalty is a combination of L1 and L2.
@@ -123,6 +132,7 @@ class LinearRegression(override val uid: String)
 val numFeatures = summarizer.mean.size
 val yMean = statCounter.mean
 val yStd = math.sqrt(statCounter.variance)
+// look at glmnet5.m L761 maaaybe that has info
 
 // If the yStd is zero, then the intercept is yMean with zero weights;
 // as a result, training is not needed.
@@ -142,7 +152,7 @@ class LinearRegression(override val uid: String)
 val effectiveL1RegParam = $(elasticNetParam) * effectiveRegParam
 val effectiveL2RegParam = (1.0 - $(elasticNetParam)) * effectiveRegParam
 
-val costFun = new LeastSquaresCostFun(instances, yStd, yMean,
+val costFun = new LeastSquaresCostFun(instances, yStd, yMean, 
$(fitIntercept),
   featuresStd, featuresMean, effectiveL2RegParam)
 
 val optimizer = if ($(elasticNetParam) == 0.0 || effectiveRegParam == 0.0) 
{
@@ -180,7 +190,7 @@ class LinearRegression(override val uid: String)
 // The intercept in R's GLMNET is computed using closed form after the 
coefficients are
 // converged. See the following discussion for 

spark git commit: [SPARK-8111] [SPARKR] SparkR shell should display Spark logo and version banner on startup.

2015-06-23 Thread shivaram
Repository: spark
Updated Branches:
  refs/heads/master f2022fa0d -> f2fb0285a


[SPARK-8111] [SPARKR] SparkR shell should display Spark logo and version banner 
on startup.

spark version is taken from the environment variable SPARK_VERSION

Author: Alok  Singh singhal@Aloks-MacBook-Pro.local
Author: Alok  Singh sing...@aloks-mbp.usca.ibm.com

Closes #6944 from aloknsingh/aloknsingh_spark_jiras and squashes the following 
commits:

ed607bd [Alok  Singh] [SPARK-8111][SparkR] As per suggestion, 1) using the 
version from sparkContext rather than the Sys.env. 2) change "Welcome to 
SparkR!" to "Welcome to" followed by Spark logo and version
acd5b85 [Alok  Singh] fix the jira SPARK-8111 to add the spark version and 
logo. Currently spark version is taken from the environment variable 
SPARK_VERSION


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f2fb0285
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f2fb0285
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f2fb0285

Branch: refs/heads/master
Commit: f2fb0285ab6d4225c5350f109dea6c1c017bb491
Parents: f2022fa
Author: Alok  Singh singhal@Aloks-MacBook-Pro.local
Authored: Tue Jun 23 12:47:55 2015 -0700
Committer: Shivaram Venkataraman shiva...@cs.berkeley.edu
Committed: Tue Jun 23 12:47:55 2015 -0700

--
 R/pkg/inst/profile/shell.R | 16 +++-
 1 file changed, 15 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/f2fb0285/R/pkg/inst/profile/shell.R
--
diff --git a/R/pkg/inst/profile/shell.R b/R/pkg/inst/profile/shell.R
index 773b6ec..7189f1a 100644
--- a/R/pkg/inst/profile/shell.R
+++ b/R/pkg/inst/profile/shell.R
@@ -27,7 +27,21 @@
   sc <- SparkR::sparkR.init()
   assign("sc", sc, envir=.GlobalEnv)
   sqlContext <- SparkR::sparkRSQL.init(sc)
+  sparkVer <- SparkR:::callJMethod(sc, "version")
   assign("sqlContext", sqlContext, envir=.GlobalEnv)
-  cat("\n Welcome to SparkR!")
+  cat("\n Welcome to")
+  cat("\n")
+  cat("    ____              __", "\n")
+  cat("   / __/__  ___ _____/ /__", "\n")
+  cat("  _\\ \\/ _ \\/ _ `/ __/  '_/", "\n")
+  cat(" /___/ .__/\\_,_/_/ /_/\\_\\")
+  if (nchar(sparkVer) == 0) {
+    cat("\n")
+  } else {
+    cat("   version ", sparkVer, "\n")
+  }
+  cat("    /_/", "\n")
+  cat("\n")
+
   cat("\n Spark context is available as sc, SQL context is available as sqlContext\n")
 }





spark git commit: [SPARK-8432] [SQL] fix hashCode() and equals() of BinaryType in Row

2015-06-23 Thread marmbrus
Repository: spark
Updated Branches:
  refs/heads/master 7b1450b66 -> 6f4cadf5e


[SPARK-8432] [SQL] fix hashCode() and equals() of BinaryType in Row

Also added more tests in LiteralExpressionSuite
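
For context, a short Scala sketch of why BinaryType values (backed by Array[Byte]) need
special-casing: JVM arrays compare by reference and use identity hash codes, so structural
comparison has to go through java.util.Arrays. This is background, not the patch itself:

    import java.util.Arrays

    val a = Array[Byte](1, 2, 3)
    val b = Array[Byte](1, 2, 3)

    // Reference semantics: equal contents still look different.
    println(a == b)                                   // false
    println(a.hashCode == b.hashCode)                 // almost certainly false

    // Structural semantics, as Row must use for BinaryType columns.
    println(Arrays.equals(a, b))                      // true
    println(Arrays.hashCode(a) == Arrays.hashCode(b)) // true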

Author: Davies Liu dav...@databricks.com

Closes #6876 from davies/fix_hashcode and squashes the following commits:

429c2c0 [Davies Liu] Merge branch 'master' of github.com:apache/spark into 
fix_hashcode
32d9811 [Davies Liu] fix test
a0626ed [Davies Liu] Merge branch 'master' of github.com:apache/spark into 
fix_hashcode
89c2432 [Davies Liu] fix style
bd20780 [Davies Liu] check with catalyst types
41caec6 [Davies Liu] change for to while
d96929b [Davies Liu] address comment
6ad2a90 [Davies Liu] fix style
5819d33 [Davies Liu] unify equals() and hashCode()
0fff25d [Davies Liu] fix style
53c38b1 [Davies Liu] fix hashCode() and equals() of BinaryType in Row


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6f4cadf5
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6f4cadf5
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6f4cadf5

Branch: refs/heads/master
Commit: 6f4cadf5ee81467d077febc53d36571dd232295d
Parents: 7b1450b
Author: Davies Liu dav...@databricks.com
Authored: Tue Jun 23 11:55:47 2015 -0700
Committer: Michael Armbrust mich...@databricks.com
Committed: Tue Jun 23 11:55:47 2015 -0700

--
 .../main/java/org/apache/spark/sql/BaseRow.java | 21 --
 .../main/scala/org/apache/spark/sql/Row.scala   | 32 --
 .../apache/spark/sql/catalyst/InternalRow.scala | 67 +++-
 .../codegen/GenerateProjection.scala|  1 +
 .../spark/sql/catalyst/expressions/rows.scala   | 52 ---
 .../expressions/ExpressionEvalHelper.scala  | 27 ++--
 .../expressions/LiteralExpressionSuite.scala| 61 +++---
 .../expressions/StringFunctionsSuite.scala  |  5 +-
 .../apache/spark/unsafe/types/UTF8String.java   |  6 +-
 .../spark/unsafe/types/UTF8StringSuite.java |  2 -
 10 files changed, 139 insertions(+), 135 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/6f4cadf5/sql/catalyst/src/main/java/org/apache/spark/sql/BaseRow.java
--
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/BaseRow.java 
b/sql/catalyst/src/main/java/org/apache/spark/sql/BaseRow.java
index 611e02d..6a2356f 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/BaseRow.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/BaseRow.java
@@ -155,27 +155,6 @@ public abstract class BaseRow extends InternalRow {
 throw new UnsupportedOperationException();
   }
 
-  /**
-   * A generic version of Row.equals(Row), which is used for tests.
-   */
-  @Override
-  public boolean equals(Object other) {
-if (other instanceof Row) {
-  Row row = (Row) other;
-  int n = size();
-  if (n != row.size()) {
-return false;
-  }
-  for (int i = 0; i < n; i ++) {
-    if (isNullAt(i) != row.isNullAt(i) || (!isNullAt(i) && !get(i).equals(row.get(i)))) {
-  return false;
-}
-  }
-  return true;
-}
-return false;
-  }
-
   @Override
   public InternalRow copy() {
 final int n = size();

http://git-wip-us.apache.org/repos/asf/spark/blob/6f4cadf5/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
--
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
index 8aaf5d7..e99d5c8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
@@ -17,8 +17,6 @@
 
 package org.apache.spark.sql
 
-import scala.util.hashing.MurmurHash3
-
 import org.apache.spark.sql.catalyst.expressions.GenericRow
 import org.apache.spark.sql.types.StructType
 
@@ -365,36 +363,6 @@ trait Row extends Serializable {
 false
   }
 
-  override def equals(that: Any): Boolean = that match {
-    case null => false
-    case that: Row =>
-      if (this.length != that.length) {
-        return false
-      }
-      var i = 0
-      val len = this.length
-      while (i < len) {
-        if (apply(i) != that.apply(i)) {
-          return false
-        }
-        i += 1
-      }
-      true
-    case _ => false
-  }
-
-  override def hashCode: Int = {
-    // Using Scala's Seq hash code implementation.
-    var n = 0
-    var h = MurmurHash3.seqSeed
-    val len = length
-    while (n < len) {
-      h = MurmurHash3.mix(h, apply(n).##)
-      n += 1
-    }
-    MurmurHash3.finalizeHash(h, n)
-  }
-
   /* -- utility methods for Scala -- */
 
   /**


spark git commit: [DOC] [SQL] Addes Hive metastore Parquet table conversion section

2015-06-23 Thread lian
Repository: spark
Updated Branches:
  refs/heads/master a8031183a -> d96d7b557


[DOC] [SQL] Addes Hive metastore Parquet table conversion section

This PR adds a section about Hive metastore Parquet table conversion. It 
documents:

1. Schema reconciliation rules introduced in #5214 (see [this comment] [1] in 
#5188)
2. Metadata refreshing requirement introduced in #5339

[1]: https://github.com/apache/spark/pull/5188#issuecomment-86531248
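
For point 2, a Scala-side equivalent of the Python snippet added in the diff below
(assuming sqlContext is an existing HiveContext and my_table is a converted Parquet table):

    // Spark SQL caches Parquet metadata for converted tables; refresh it after the
    // table is updated outside of Spark SQL (e.g. directly through Hive).
    sqlContext.refreshTable("my_table")
    // or, mirroring the Python snippet in the diff:
    sqlContext.sql("REFRESH TABLE my_table")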

Author: Cheng Lian l...@databricks.com

Closes #5348 from liancheng/sql-doc-parquet-conversion and squashes the 
following commits:

42ae0d0 [Cheng Lian] Adds Python `refreshTable` snippet
4c9847d [Cheng Lian] Resorts to SQL for Python metadata refreshing snippet
756e660 [Cheng Lian] Adds Python snippet for metadata refreshing
50675db [Cheng Lian] Addes Hive metastore Parquet table conversion section


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d96d7b55
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d96d7b55
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d96d7b55

Branch: refs/heads/master
Commit: d96d7b55746cf034e3935ec4b22614a99e48c498
Parents: a803118
Author: Cheng Lian l...@databricks.com
Authored: Tue Jun 23 14:19:21 2015 -0700
Committer: Cheng Lian l...@databricks.com
Committed: Tue Jun 23 14:19:21 2015 -0700

--
 docs/sql-programming-guide.md | 94 +++---
 1 file changed, 88 insertions(+), 6 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/d96d7b55/docs/sql-programming-guide.md
--
diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 26c036f..9107c9b 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -22,7 +22,7 @@ The DataFrame API is available in 
[Scala](api/scala/index.html#org.apache.spark.
 All of the examples on this page use sample data included in the Spark 
distribution and can be run in the `spark-shell`, `pyspark` shell, or `sparkR` 
shell.
 
 
-## Starting Point: `SQLContext`
+## Starting Point: SQLContext
 
 <div class="codetabs">
 <div data-lang="scala"  markdown="1">
@@ -1036,6 +1036,15 @@ for (teenName in collect(teenNames)) {
 
 </div>
 
+<div data-lang="python"  markdown="1">
+
+{% highlight python %}
+# sqlContext is an existing HiveContext
+sqlContext.sql("REFRESH TABLE my_table")
+{% endhighlight %}
+
+</div>
+
 <div data-lang="sql"  markdown="1">
 
 {% highlight sql %}
@@ -1054,7 +1063,7 @@ SELECT * FROM parquetTable
 
 </div>
 
-### Partition discovery
+### Partition Discovery
 
 Table partitioning is a common optimization approach used in systems like 
Hive.  In a partitioned
 table, data are usually stored in different directories, with partitioning 
column values encoded in
@@ -1108,7 +1117,7 @@ can be configured by 
`spark.sql.sources.partitionColumnTypeInference.enabled`, w
 `true`. When type inference is disabled, string type will be used for the 
partitioning columns.
 
 
-### Schema merging
+### Schema Merging
 
 Like ProtocolBuffer, Avro, and Thrift, Parquet also supports schema evolution. 
 Users can start with
 a simple schema, and gradually add more columns to the schema as needed.  In 
this way, users may end
@@ -1208,6 +1217,79 @@ printSchema(df3)
 
 </div>
 
+### Hive metastore Parquet table conversion
+
+When reading from and writing to Hive metastore Parquet tables, Spark SQL will 
try to use its own
+Parquet support instead of Hive SerDe for better performance. This behavior is 
controlled by the
+`spark.sql.hive.convertMetastoreParquet` configuration, and is turned on by 
default.
+
+ Hive/Parquet Schema Reconciliation
+
+There are two key differences between Hive and Parquet from the perspective of 
table schema
+processing.
+
+1. Hive is case insensitive, while Parquet is not
+1. Hive considers all columns nullable, while nullability in Parquet is 
significant
+
+Due to this reason, we must reconcile Hive metastore schema with Parquet 
schema when converting a
+Hive metastore Parquet table to a Spark SQL Parquet table.  The reconciliation 
rules are:
+
+1. Fields that have the same name in both schema must have the same data type 
regardless of
+   nullability.  The reconciled field should have the data type of the Parquet 
side, so that
+   nullability is respected.
+
+1. The reconciled schema contains exactly those fields defined in Hive 
metastore schema.
+
+   - Any fields that only appear in the Parquet schema are dropped in the 
reconciled schema.
+   - Any fileds that only appear in the Hive metastore schema are added as 
nullable field in the
+ reconciled schema.
+
+ Metadata Refreshing
+
+Spark SQL caches Parquet metadata for better performance.  When Hive metastore 
Parquet table
+conversion is enabled, metadata of those 

spark git commit: [SPARK-8525] [MLLIB] fix LabeledPoint parser when there is a whitespace between label and features vector

2015-06-23 Thread meng
Repository: spark
Updated Branches:
  refs/heads/branch-1.1 36eed2f9e -> faa35ca05


[SPARK-8525] [MLLIB] fix LabeledPoint parser when there is a whitespace between 
label and features vector

fix LabeledPoint parser when there is a whitespace between label and features 
vector, e.g.
(y, [x1, x2, x3])

Author: Oleksiy Dyagilev oleksiy_dyagi...@epam.com

Closes #6954 from fe2s/SPARK-8525 and squashes the following commits:

0755b9d [Oleksiy Dyagilev] [SPARK-8525][MLLIB] addressing comment, removing dep 
on commons-lang
c1abc2b [Oleksiy Dyagilev] [SPARK-8525][MLLIB] fix LabeledPoint parser when 
there is a whitespace on specific position

(cherry picked from commit a8031183aff2e23de9204ddfc7e7f5edbf052a7e)
Signed-off-by: Xiangrui Meng m...@databricks.com


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/faa35ca0
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/faa35ca0
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/faa35ca0

Branch: refs/heads/branch-1.1
Commit: faa35ca05baba40e82bfeb79d4c9f47a5ef62bcc
Parents: 36eed2f
Author: Oleksiy Dyagilev oleksiy_dyagi...@epam.com
Authored: Tue Jun 23 13:12:19 2015 -0700
Committer: Xiangrui Meng m...@databricks.com
Committed: Tue Jun 23 13:18:05 2015 -0700

--
 .../scala/org/apache/spark/mllib/util/NumericParser.scala | 2 ++
 .../org/apache/spark/mllib/regression/LabeledPointSuite.scala | 5 +
 .../org/apache/spark/mllib/util/NumericParserSuite.scala  | 7 +++
 3 files changed, 14 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/faa35ca0/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
index f7cba6c..629b0af 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
@@ -98,6 +98,8 @@ private[mllib] object NumericParser {
 }
   } else if (token == ")") {
     parsing = false
+  } else if (token.trim.isEmpty){
+    // ignore whitespaces between delim chars, e.g. ", ["
   } else {
 // expecting a number
 items.append(parseDouble(token))

http://git-wip-us.apache.org/repos/asf/spark/blob/faa35ca0/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
 
b/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
index 110c44a..fc08bac 100644
--- 
a/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
+++ 
b/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
@@ -32,6 +32,11 @@ class LabeledPointSuite extends FunSuite {
 }
   }
 
+  test("parse labeled points with whitespaces") {
+    val point = LabeledPoint.parse("(0.0, [1.0, 2.0])")
+    assert(point === LabeledPoint(0.0, Vectors.dense(1.0, 2.0)))
+  }
+
   test("parse labeled points with v0.9 format") {
     val point = LabeledPoint.parse("1.0,1.0 0.0 -2.0")
     assert(point === LabeledPoint(1.0, Vectors.dense(1.0, 0.0, -2.0)))

http://git-wip-us.apache.org/repos/asf/spark/blob/faa35ca0/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala 
b/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
index f68fb95..5027311 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
@@ -39,4 +39,11 @@ class NumericParserSuite extends FunSuite {
   }
 }
   }
+
+  test("parser with whitespaces") {
+    val s = "(0.0, [1.0, 2.0])"
+val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]]
+assert(parsed(0).asInstanceOf[Double] === 0.0)
+assert(parsed(1).asInstanceOf[Array[Double]] === Array(1.0, 2.0))
+  }
 }





spark git commit: [SPARK-8525] [MLLIB] fix LabeledPoint parser when there is a whitespace between label and features vector

2015-06-23 Thread meng
Repository: spark
Updated Branches:
  refs/heads/branch-1.4 27693e175 -> 8d6e3636e


[SPARK-8525] [MLLIB] fix LabeledPoint parser when there is a whitespace between 
label and features vector

fix LabeledPoint parser when there is a whitespace between label and features 
vector, e.g.
(y, [x1, x2, x3])

Author: Oleksiy Dyagilev oleksiy_dyagi...@epam.com

Closes #6954 from fe2s/SPARK-8525 and squashes the following commits:

0755b9d [Oleksiy Dyagilev] [SPARK-8525][MLLIB] addressing comment, removing dep 
on commons-lang
c1abc2b [Oleksiy Dyagilev] [SPARK-8525][MLLIB] fix LabeledPoint parser when 
there is a whitespace on specific position

(cherry picked from commit a8031183aff2e23de9204ddfc7e7f5edbf052a7e)
Signed-off-by: Xiangrui Meng m...@databricks.com


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8d6e3636
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8d6e3636
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8d6e3636

Branch: refs/heads/branch-1.4
Commit: 8d6e3636e9599db0e45e9e6f4e71a08cdc91e11f
Parents: 27693e1
Author: Oleksiy Dyagilev oleksiy_dyagi...@epam.com
Authored: Tue Jun 23 13:12:19 2015 -0700
Committer: Xiangrui Meng m...@databricks.com
Committed: Tue Jun 23 13:15:27 2015 -0700

--
 .../scala/org/apache/spark/mllib/util/NumericParser.scala | 2 ++
 .../org/apache/spark/mllib/regression/LabeledPointSuite.scala | 5 +
 .../org/apache/spark/mllib/util/NumericParserSuite.scala  | 7 +++
 3 files changed, 14 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/8d6e3636/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
index 308f7f3..a841c5c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
@@ -98,6 +98,8 @@ private[mllib] object NumericParser {
 }
   } else if (token == ")") {
     parsing = false
+  } else if (token.trim.isEmpty){
+    // ignore whitespaces between delim chars, e.g. ", ["
   } else {
 // expecting a number
 items.append(parseDouble(token))

http://git-wip-us.apache.org/repos/asf/spark/blob/8d6e3636/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
 
b/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
index d8364a0..f8d0af8 100644
--- 
a/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
+++ 
b/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
@@ -31,6 +31,11 @@ class LabeledPointSuite extends SparkFunSuite {
 }
   }
 
+  test("parse labeled points with whitespaces") {
+    val point = LabeledPoint.parse("(0.0, [1.0, 2.0])")
+    assert(point === LabeledPoint(0.0, Vectors.dense(1.0, 2.0)))
+  }
+
   test("parse labeled points with v0.9 format") {
     val point = LabeledPoint.parse("1.0,1.0 0.0 -2.0")
     assert(point === LabeledPoint(1.0, Vectors.dense(1.0, 0.0, -2.0)))

http://git-wip-us.apache.org/repos/asf/spark/blob/8d6e3636/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala 
b/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
index 8dcb9ba..fa4f74d 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
@@ -37,4 +37,11 @@ class NumericParserSuite extends SparkFunSuite {
   }
 }
   }
+
+  test("parser with whitespaces") {
+    val s = "(0.0, [1.0, 2.0])"
+val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]]
+assert(parsed(0).asInstanceOf[Double] === 0.0)
+assert(parsed(1).asInstanceOf[Array[Double]] === Array(1.0, 2.0))
+  }
 }





spark git commit: [SPARK-8525] [MLLIB] fix LabeledPoint parser when there is a whitespace between label and features vector

2015-06-23 Thread meng
Repository: spark
Updated Branches:
  refs/heads/master f2fb0285a -> a8031183a


[SPARK-8525] [MLLIB] fix LabeledPoint parser when there is a whitespace between 
label and features vector

fix LabeledPoint parser when there is a whitespace between label and features 
vector, e.g.
(y, [x1, x2, x3])

Author: Oleksiy Dyagilev oleksiy_dyagi...@epam.com

Closes #6954 from fe2s/SPARK-8525 and squashes the following commits:

0755b9d [Oleksiy Dyagilev] [SPARK-8525][MLLIB] addressing comment, removing dep 
on commons-lang
c1abc2b [Oleksiy Dyagilev] [SPARK-8525][MLLIB] fix LabeledPoint parser when 
there is a whitespace on specific position


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a8031183
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a8031183
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a8031183

Branch: refs/heads/master
Commit: a8031183aff2e23de9204ddfc7e7f5edbf052a7e
Parents: f2fb028
Author: Oleksiy Dyagilev oleksiy_dyagi...@epam.com
Authored: Tue Jun 23 13:12:19 2015 -0700
Committer: Xiangrui Meng m...@databricks.com
Committed: Tue Jun 23 13:12:19 2015 -0700

--
 .../scala/org/apache/spark/mllib/util/NumericParser.scala | 2 ++
 .../org/apache/spark/mllib/regression/LabeledPointSuite.scala | 5 +
 .../org/apache/spark/mllib/util/NumericParserSuite.scala  | 7 +++
 3 files changed, 14 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/a8031183/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
index 308f7f3..a841c5c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
@@ -98,6 +98,8 @@ private[mllib] object NumericParser {
 }
   } else if (token == ")") {
     parsing = false
+  } else if (token.trim.isEmpty){
+    // ignore whitespaces between delim chars, e.g. ", ["
   } else {
 // expecting a number
 items.append(parseDouble(token))

http://git-wip-us.apache.org/repos/asf/spark/blob/a8031183/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
 
b/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
index d8364a0..f8d0af8 100644
--- 
a/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
+++ 
b/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
@@ -31,6 +31,11 @@ class LabeledPointSuite extends SparkFunSuite {
 }
   }
 
+  test("parse labeled points with whitespaces") {
+    val point = LabeledPoint.parse("(0.0, [1.0, 2.0])")
+    assert(point === LabeledPoint(0.0, Vectors.dense(1.0, 2.0)))
+  }
+
   test("parse labeled points with v0.9 format") {
     val point = LabeledPoint.parse("1.0,1.0 0.0 -2.0")
     assert(point === LabeledPoint(1.0, Vectors.dense(1.0, 0.0, -2.0)))

http://git-wip-us.apache.org/repos/asf/spark/blob/a8031183/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala 
b/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
index 8dcb9ba..fa4f74d 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
@@ -37,4 +37,11 @@ class NumericParserSuite extends SparkFunSuite {
   }
 }
   }
+
+  test("parser with whitespaces") {
+    val s = "(0.0, [1.0, 2.0])"
+val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]]
+assert(parsed(0).asInstanceOf[Double] === 0.0)
+assert(parsed(1).asInstanceOf[Array[Double]] === Array(1.0, 2.0))
+  }
 }





spark git commit: [SPARK-8525] [MLLIB] fix LabeledPoint parser when there is a whitespace between label and features vector

2015-06-23 Thread meng
Repository: spark
Updated Branches:
  refs/heads/branch-1.3 716dcf631 -> 88e303f6e


[SPARK-8525] [MLLIB] fix LabeledPoint parser when there is a whitespace between 
label and features vector

fix LabeledPoint parser when there is a whitespace between label and features 
vector, e.g.
(y, [x1, x2, x3])

Author: Oleksiy Dyagilev oleksiy_dyagi...@epam.com

Closes #6954 from fe2s/SPARK-8525 and squashes the following commits:

0755b9d [Oleksiy Dyagilev] [SPARK-8525][MLLIB] addressing comment, removing dep 
on commons-lang
c1abc2b [Oleksiy Dyagilev] [SPARK-8525][MLLIB] fix LabeledPoint parser when 
there is a whitespace on specific position

(cherry picked from commit a8031183aff2e23de9204ddfc7e7f5edbf052a7e)
Signed-off-by: Xiangrui Meng m...@databricks.com


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/88e303f6
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/88e303f6
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/88e303f6

Branch: refs/heads/branch-1.3
Commit: 88e303f6e991cdbed519bf11820e1280057581e9
Parents: 716dcf6
Author: Oleksiy Dyagilev oleksiy_dyagi...@epam.com
Authored: Tue Jun 23 13:12:19 2015 -0700
Committer: Xiangrui Meng m...@databricks.com
Committed: Tue Jun 23 13:17:27 2015 -0700

--
 .../scala/org/apache/spark/mllib/util/NumericParser.scala | 2 ++
 .../org/apache/spark/mllib/regression/LabeledPointSuite.scala | 5 +
 .../org/apache/spark/mllib/util/NumericParserSuite.scala  | 7 +++
 3 files changed, 14 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/88e303f6/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
index 308f7f3..a841c5c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
@@ -98,6 +98,8 @@ private[mllib] object NumericParser {
 }
   } else if (token == ")") {
     parsing = false
+  } else if (token.trim.isEmpty) {
+    // ignore whitespaces between delim chars, e.g. ", ["
   } else {
 // expecting a number
 items.append(parseDouble(token))

http://git-wip-us.apache.org/repos/asf/spark/blob/88e303f6/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
 
b/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
index d8364a0..f8d0af8 100644
--- 
a/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
+++ 
b/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
@@ -31,6 +31,11 @@ class LabeledPointSuite extends SparkFunSuite {
 }
   }
 
+  test("parse labeled points with whitespaces") {
+    val point = LabeledPoint.parse("(0.0, [1.0, 2.0])")
+    assert(point === LabeledPoint(0.0, Vectors.dense(1.0, 2.0)))
+  }
+
   test("parse labeled points with v0.9 format") {
     val point = LabeledPoint.parse("1.0,1.0 0.0 -2.0")
 assert(point === LabeledPoint(1.0, Vectors.dense(1.0, 0.0, -2.0)))

http://git-wip-us.apache.org/repos/asf/spark/blob/88e303f6/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala 
b/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
index 8dcb9ba..fa4f74d 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
@@ -37,4 +37,11 @@ class NumericParserSuite extends SparkFunSuite {
   }
 }
   }
+
+  test("parser with whitespaces") {
+    val s = "(0.0, [1.0, 2.0])"
+    val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]]
+    assert(parsed(0).asInstanceOf[Double] === 0.0)
+    assert(parsed(1).asInstanceOf[Array[Double]] === Array(1.0, 2.0))
+  }
 }


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-8525] [MLLIB] fix LabeledPoint parser when there is a whitespace between label and features vector

2015-06-23 Thread meng
Repository: spark
Updated Branches:
  refs/heads/branch-1.2 30789f6ef - 24c2c58c2


[SPARK-8525] [MLLIB] fix LabeledPoint parser when there is a whitespace between 
label and features vector

fix LabeledPoint parser when there is a whitespace between label and features 
vector, e.g.
(y, [x1, x2, x3])

Author: Oleksiy Dyagilev oleksiy_dyagi...@epam.com

Closes #6954 from fe2s/SPARK-8525 and squashes the following commits:

0755b9d [Oleksiy Dyagilev] [SPARK-8525][MLLIB] addressing comment, removing dep 
on commons-lang
c1abc2b [Oleksiy Dyagilev] [SPARK-8525][MLLIB] fix LabeledPoint parser when 
there is a whitespace on specific position

(cherry picked from commit a8031183aff2e23de9204ddfc7e7f5edbf052a7e)
Signed-off-by: Xiangrui Meng m...@databricks.com


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/24c2c58c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/24c2c58c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/24c2c58c

Branch: refs/heads/branch-1.2
Commit: 24c2c58c28dfcc32d3d4aa2ea089d8bdaaa7ecf3
Parents: 30789f6
Author: Oleksiy Dyagilev oleksiy_dyagi...@epam.com
Authored: Tue Jun 23 13:12:19 2015 -0700
Committer: Xiangrui Meng m...@databricks.com
Committed: Tue Jun 23 13:17:43 2015 -0700

--
 .../scala/org/apache/spark/mllib/util/NumericParser.scala | 2 ++
 .../org/apache/spark/mllib/regression/LabeledPointSuite.scala | 5 +
 .../org/apache/spark/mllib/util/NumericParserSuite.scala  | 7 +++
 3 files changed, 14 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/24c2c58c/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
index f7cba6c..629b0af 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala
@@ -98,6 +98,8 @@ private[mllib] object NumericParser {
 }
   } else if (token == ")") {
     parsing = false
+  } else if (token.trim.isEmpty) {
+    // ignore whitespaces between delim chars, e.g. ", ["
   } else {
 // expecting a number
 items.append(parseDouble(token))

http://git-wip-us.apache.org/repos/asf/spark/blob/24c2c58c/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
 
b/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
index 110c44a..fc08bac 100644
--- 
a/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
+++ 
b/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala
@@ -32,6 +32,11 @@ class LabeledPointSuite extends FunSuite {
 }
   }
 
+  test("parse labeled points with whitespaces") {
+    val point = LabeledPoint.parse("(0.0, [1.0, 2.0])")
+    assert(point === LabeledPoint(0.0, Vectors.dense(1.0, 2.0)))
+  }
+
   test("parse labeled points with v0.9 format") {
     val point = LabeledPoint.parse("1.0,1.0 0.0 -2.0")
 assert(point === LabeledPoint(1.0, Vectors.dense(1.0, 0.0, -2.0)))

http://git-wip-us.apache.org/repos/asf/spark/blob/24c2c58c/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala 
b/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
index f68fb95..5027311 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
@@ -39,4 +39,11 @@ class NumericParserSuite extends FunSuite {
   }
 }
   }
+
+  test("parser with whitespaces") {
+    val s = "(0.0, [1.0, 2.0])"
+    val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]]
+    assert(parsed(0).asInstanceOf[Double] === 0.0)
+    assert(parsed(1).asInstanceOf[Array[Double]] === Array(1.0, 2.0))
+  }
 }


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



Git Push Summary

2015-06-23 Thread pwendell
Repository: spark
Updated Tags:  refs/tags/v1.4.1-rc1 [created] 60e08e507

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[1/2] spark git commit: Preparing Spark release v1.4.1-rc1

2015-06-23 Thread pwendell
Repository: spark
Updated Branches:
  refs/heads/branch-1.4 13f7b0a91 - eafbe1345


Preparing Spark release v1.4.1-rc1


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/60e08e50
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/60e08e50
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/60e08e50

Branch: refs/heads/branch-1.4
Commit: 60e08e50751fe3929156de956d62faea79f5b801
Parents: 13f7b0a
Author: Patrick Wendell pwend...@gmail.com
Authored: Tue Jun 23 19:48:39 2015 -0700
Committer: Patrick Wendell pwend...@gmail.com
Committed: Tue Jun 23 19:48:39 2015 -0700

--
 assembly/pom.xml  | 2 +-
 bagel/pom.xml | 2 +-
 core/pom.xml  | 2 +-
 examples/pom.xml  | 2 +-
 external/flume-sink/pom.xml   | 2 +-
 external/flume/pom.xml| 2 +-
 external/kafka-assembly/pom.xml   | 2 +-
 external/kafka/pom.xml| 2 +-
 external/mqtt/pom.xml | 2 +-
 external/twitter/pom.xml  | 2 +-
 external/zeromq/pom.xml   | 2 +-
 extras/java8-tests/pom.xml| 2 +-
 extras/kinesis-asl/pom.xml| 2 +-
 extras/spark-ganglia-lgpl/pom.xml | 2 +-
 graphx/pom.xml| 2 +-
 launcher/pom.xml  | 2 +-
 mllib/pom.xml | 2 +-
 network/common/pom.xml| 2 +-
 network/shuffle/pom.xml   | 2 +-
 network/yarn/pom.xml  | 2 +-
 pom.xml   | 2 +-
 repl/pom.xml  | 2 +-
 sql/catalyst/pom.xml  | 2 +-
 sql/core/pom.xml  | 2 +-
 sql/hive-thriftserver/pom.xml | 2 +-
 sql/hive/pom.xml  | 2 +-
 streaming/pom.xml | 2 +-
 tools/pom.xml | 2 +-
 unsafe/pom.xml| 2 +-
 yarn/pom.xml  | 2 +-
 30 files changed, 30 insertions(+), 30 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/60e08e50/assembly/pom.xml
--
diff --git a/assembly/pom.xml b/assembly/pom.xml
index 228db59..ba233e7 100644
--- a/assembly/pom.xml
+++ b/assembly/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.2-SNAPSHOT</version>
+    <version>1.4.1</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 

http://git-wip-us.apache.org/repos/asf/spark/blob/60e08e50/bagel/pom.xml
--
diff --git a/bagel/pom.xml b/bagel/pom.xml
index ce791a6..c5e9183 100644
--- a/bagel/pom.xml
+++ b/bagel/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.2-SNAPSHOT</version>
+    <version>1.4.1</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 

http://git-wip-us.apache.org/repos/asf/spark/blob/60e08e50/core/pom.xml
--
diff --git a/core/pom.xml b/core/pom.xml
index 176ea9b..f0d236d 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.2-SNAPSHOT</version>
+    <version>1.4.1</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 

http://git-wip-us.apache.org/repos/asf/spark/blob/60e08e50/examples/pom.xml
--
diff --git a/examples/pom.xml b/examples/pom.xml
index 877c2fb..e9a9cc2 100644
--- a/examples/pom.xml
+++ b/examples/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.2-SNAPSHOT</version>
+    <version>1.4.1</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 

http://git-wip-us.apache.org/repos/asf/spark/blob/60e08e50/external/flume-sink/pom.xml
--
diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml
index ad431fa..7eae7a7 100644
--- a/external/flume-sink/pom.xml
+++ b/external/flume-sink/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.2-SNAPSHOT</version>
+    <version>1.4.1</version>
     <relativePath>../../pom.xml</relativePath>
   </parent>
 

http://git-wip-us.apache.org/repos/asf/spark/blob/60e08e50/external/flume/pom.xml
--
diff --git a/external/flume/pom.xml b/external/flume/pom.xml
index 9789435..b3ad09a 100644
--- a/external/flume/pom.xml
+++ b/external/flume/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
 

[2/2] spark git commit: Preparing development version 1.4.2-SNAPSHOT

2015-06-23 Thread pwendell
Preparing development version 1.4.2-SNAPSHOT


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/eafbe134
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/eafbe134
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/eafbe134

Branch: refs/heads/branch-1.4
Commit: eafbe1345921e90f2099cff809e2d7fe96f197e5
Parents: 60e08e5
Author: Patrick Wendell pwend...@gmail.com
Authored: Tue Jun 23 19:48:44 2015 -0700
Committer: Patrick Wendell pwend...@gmail.com
Committed: Tue Jun 23 19:48:44 2015 -0700

--
 assembly/pom.xml  | 2 +-
 bagel/pom.xml | 2 +-
 core/pom.xml  | 2 +-
 examples/pom.xml  | 2 +-
 external/flume-sink/pom.xml   | 2 +-
 external/flume/pom.xml| 2 +-
 external/kafka-assembly/pom.xml   | 2 +-
 external/kafka/pom.xml| 2 +-
 external/mqtt/pom.xml | 2 +-
 external/twitter/pom.xml  | 2 +-
 external/zeromq/pom.xml   | 2 +-
 extras/java8-tests/pom.xml| 2 +-
 extras/kinesis-asl/pom.xml| 2 +-
 extras/spark-ganglia-lgpl/pom.xml | 2 +-
 graphx/pom.xml| 2 +-
 launcher/pom.xml  | 2 +-
 mllib/pom.xml | 2 +-
 network/common/pom.xml| 2 +-
 network/shuffle/pom.xml   | 2 +-
 network/yarn/pom.xml  | 2 +-
 pom.xml   | 2 +-
 repl/pom.xml  | 2 +-
 sql/catalyst/pom.xml  | 2 +-
 sql/core/pom.xml  | 2 +-
 sql/hive-thriftserver/pom.xml | 2 +-
 sql/hive/pom.xml  | 2 +-
 streaming/pom.xml | 2 +-
 tools/pom.xml | 2 +-
 unsafe/pom.xml| 2 +-
 yarn/pom.xml  | 2 +-
 30 files changed, 30 insertions(+), 30 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/eafbe134/assembly/pom.xml
--
diff --git a/assembly/pom.xml b/assembly/pom.xml
index ba233e7..228db59 100644
--- a/assembly/pom.xml
+++ b/assembly/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.1</version>
+    <version>1.4.2-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 

http://git-wip-us.apache.org/repos/asf/spark/blob/eafbe134/bagel/pom.xml
--
diff --git a/bagel/pom.xml b/bagel/pom.xml
index c5e9183..ce791a6 100644
--- a/bagel/pom.xml
+++ b/bagel/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.1</version>
+    <version>1.4.2-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 

http://git-wip-us.apache.org/repos/asf/spark/blob/eafbe134/core/pom.xml
--
diff --git a/core/pom.xml b/core/pom.xml
index f0d236d..176ea9b 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.1</version>
+    <version>1.4.2-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 

http://git-wip-us.apache.org/repos/asf/spark/blob/eafbe134/examples/pom.xml
--
diff --git a/examples/pom.xml b/examples/pom.xml
index e9a9cc2..877c2fb 100644
--- a/examples/pom.xml
+++ b/examples/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.1</version>
+    <version>1.4.2-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 

http://git-wip-us.apache.org/repos/asf/spark/blob/eafbe134/external/flume-sink/pom.xml
--
diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml
index 7eae7a7..ad431fa 100644
--- a/external/flume-sink/pom.xml
+++ b/external/flume-sink/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.1</version>
+    <version>1.4.2-SNAPSHOT</version>
     <relativePath>../../pom.xml</relativePath>
   </parent>
 

http://git-wip-us.apache.org/repos/asf/spark/blob/eafbe134/external/flume/pom.xml
--
diff --git a/external/flume/pom.xml b/external/flume/pom.xml
index b3ad09a..9789435 100644
--- a/external/flume/pom.xml
+++ b/external/flume/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.10</artifactId>
-    <version>1.4.1</version>
+

spark git commit: [SPARK-6749] [SQL] Make metastore client robust to underlying socket connection loss

2015-06-23 Thread yhuai
Repository: spark
Updated Branches:
  refs/heads/master a458efc66 - 50c3a86f4


[SPARK-6749] [SQL] Make metastore client robust to underlying socket connection 
loss

This works around a bug in the underlying RetryingMetaStoreClient (HIVE-10384) 
by refreshing the metastore client on thrift exceptions. We attempt to emulate 
the proper hive behavior by retrying only as configured by hiveconf.

Author: Eric Liang e...@databricks.com

Closes #6912 from ericl/spark-6749 and squashes the following commits:

2d54b55 [Eric Liang] use conf from state
0e3a74e [Eric Liang] use shim properly
980b3e5 [Eric Liang] Fix conf parsing hive 0.14 conf.
92459b6 [Eric Liang] Work around RetryingMetaStoreClient bug


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/50c3a86f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/50c3a86f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/50c3a86f

Branch: refs/heads/master
Commit: 50c3a86f42d7dfd1acbda65c1e5afbd3db1406df
Parents: a458efc
Author: Eric Liang e...@databricks.com
Authored: Tue Jun 23 22:27:17 2015 -0700
Committer: Yin Huai yh...@databricks.com
Committed: Tue Jun 23 22:27:17 2015 -0700

--
 .../spark/sql/hive/client/ClientWrapper.scala   | 55 +++-
 .../apache/spark/sql/hive/client/HiveShim.scala | 19 +++
 2 files changed, 72 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/50c3a86f/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala
--
diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala
index 42c2d4c..2f771d7 100644
--- 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala
+++ 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.hive.client
 import java.io.{BufferedReader, InputStreamReader, File, PrintStream}
 import java.net.URI
 import java.util.{ArrayList => JArrayList, Map => JMap, List => JList, Set => JSet}
+import javax.annotation.concurrent.GuardedBy
 
 import scala.collection.JavaConversions._
 import scala.language.reflectiveCalls
@@ -136,12 +137,62 @@ private[hive] class ClientWrapper(
 
   // TODO: should be a def?s
   // When we create this val client, the HiveConf of it (conf) is the one 
associated with state.
-  private val client = Hive.get(conf)
+  @GuardedBy("this")
+  private var client = Hive.get(conf)
+
+  // We use hive's conf for compatibility.
+  private val retryLimit = 
conf.getIntVar(HiveConf.ConfVars.METASTORETHRIFTFAILURERETRIES)
+  private val retryDelayMillis = 
shim.getMetastoreClientConnectRetryDelayMillis(conf)
+
+  /**
+   * Runs `f` with multiple retries in case the hive metastore is temporarily 
unreachable.
+   */
+  private def retryLocked[A](f: => A): A = synchronized {
+// Hive sometimes retries internally, so set a deadline to avoid 
compounding delays.
+val deadline = System.nanoTime + (retryLimit * retryDelayMillis * 
1e6).toLong
+var numTries = 0
+var caughtException: Exception = null
+do {
+  numTries += 1
+  try {
+return f
+  } catch {
+    case e: Exception if causedByThrift(e) =>
+      caughtException = e
+      logWarning(
+        "HiveClientWrapper got thrift exception, destroying client and retrying " +
+          s"(${retryLimit - numTries} tries remaining)", e)
+  Thread.sleep(retryDelayMillis)
+  try {
+client = Hive.get(state.getConf, true)
+  } catch {
+    case e: Exception if causedByThrift(e) =>
+      logWarning("Failed to refresh hive client, will retry.", e)
+  }
+  }
+} while (numTries <= retryLimit && System.nanoTime < deadline)
+if (System.nanoTime > deadline) {
+  logWarning("Deadline exceeded")
+}
+throw caughtException
+  }
+
+  private def causedByThrift(e: Throwable): Boolean = {
+var target = e
+while (target != null) {
+  val msg = target.getMessage()
+  if (msg != null && msg.matches("(?s).*(TApplication|TProtocol|TTransport)Exception.*")) {
+return true
+  }
+  target = target.getCause()
+}
+false
+  }
 
   /**
* Runs `f` with ThreadLocal session state and classloaders configured for 
this version of hive.
*/
-  private def withHiveState[A](f: => A): A = synchronized {
+  private def withHiveState[A](f: => A): A = retryLocked {
 val original = Thread.currentThread().getContextClassLoader
 // Set the thread local metastore client to the client associated with 
this ClientWrapper.
 Hive.set(client)


spark git commit: [SPARK-8483] [STREAMING] Remove commons-lang3 dependency from Flume Si…

2015-06-23 Thread tdas
Repository: spark
Updated Branches:
  refs/heads/master 31bd30687 - 9b618fb0d


[SPARK-8483] [STREAMING] Remove commons-lang3 dependency from Flume Si…

…nk. Also bump Flume version to 1.6.0

Author: Hari Shreedharan hshreedha...@apache.org

Closes #6910 from harishreedharan/remove-commons-lang3 and squashes the 
following commits:

9875f7d [Hari Shreedharan] Revert back to Flume 1.4.0
ca35eb0 [Hari Shreedharan] [SPARK-8483][Streaming] Remove commons-lang3 
dependency from Flume Sink. Also bump Flume version to 1.6.0


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9b618fb0
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9b618fb0
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9b618fb0

Branch: refs/heads/master
Commit: 9b618fb0d2536121d2784ff5341d74723e810fc5
Parents: 31bd306
Author: Hari Shreedharan hshreedha...@apache.org
Authored: Mon Jun 22 23:34:17 2015 -0700
Committer: Tathagata Das tathagata.das1...@gmail.com
Committed: Mon Jun 22 23:34:17 2015 -0700

--
 external/flume-sink/pom.xml  | 4 
 .../spark/streaming/flume/sink/SparkAvroCallbackHandler.scala| 4 ++--
 2 files changed, 2 insertions(+), 6 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/9b618fb0/external/flume-sink/pom.xml
--
diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml
index 7a7dccc..0664cfb 100644
--- a/external/flume-sink/pom.xml
+++ b/external/flume-sink/pom.xml
@@ -36,10 +36,6 @@
 
   <dependencies>
     <dependency>
-      <groupId>org.apache.commons</groupId>
-      <artifactId>commons-lang3</artifactId>
-    </dependency>
-    <dependency>
       <groupId>org.apache.flume</groupId>
       <artifactId>flume-ng-sdk</artifactId>
       <exclusions>

http://git-wip-us.apache.org/repos/asf/spark/blob/9b618fb0/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkAvroCallbackHandler.scala
--
diff --git 
a/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkAvroCallbackHandler.scala
 
b/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkAvroCallbackHandler.scala
index dc2a4ab..719fca0 100644
--- 
a/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkAvroCallbackHandler.scala
+++ 
b/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkAvroCallbackHandler.scala
@@ -16,13 +16,13 @@
  */
 package org.apache.spark.streaming.flume.sink
 
+import java.util.UUID
 import java.util.concurrent.{CountDownLatch, Executors}
 import java.util.concurrent.atomic.AtomicLong
 
 import scala.collection.mutable
 
 import org.apache.flume.Channel
-import org.apache.commons.lang3.RandomStringUtils
 
 /**
  * Class that implements the SparkFlumeProtocol, that is used by the Avro 
Netty Server to process
@@ -53,7 +53,7 @@ private[flume] class SparkAvroCallbackHandler(val threads: 
Int, val channel: Cha
   // Since the new txn may not have the same sequence number we must guard 
against accidentally
   // committing a new transaction. To reduce the probability of that happening 
a random string is
   // prepended to the sequence number. Does not change for life of sink
-  private val seqBase = RandomStringUtils.randomAlphanumeric(8)
+  private val seqBase = UUID.randomUUID().toString.substring(0, 8)
   private val seqCounter = new AtomicLong(0)
 
   // Protected by `sequenceNumberToProcessor`


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-8359] [SQL] Fix incorrect decimal precision after multiplication

2015-06-23 Thread davies
Repository: spark
Updated Branches:
  refs/heads/master d4f633514 - 31bd30687


[SPARK-8359] [SQL] Fix incorrect decimal precision after multiplication

JIRA: https://issues.apache.org/jira/browse/SPARK-8359
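
For context, a small standalone sketch of why the MathContext matters here, using plain java.math.BigDecimal rather than Spark's own Decimal class (the object name is made up):

```scala
import java.math.{BigDecimal => JBigDecimal, MathContext}

object MathContextSketch extends App {
  val big = JBigDecimal.valueOf(Long.MaxValue) // 19 digits

  // A bounded context such as DECIMAL128 rounds the 38-digit product down to 34 significant digits.
  println(big.multiply(big, MathContext.DECIMAL128)) // 8.507059173023461584739690778423250E+37

  // MathContext.UNLIMITED keeps the exact product, which is what the patch switches to.
  println(big.multiply(big, MathContext.UNLIMITED))  // 85070591730234615847396907784232501249
}
```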

Author: Liang-Chi Hsieh vii...@gmail.com

Closes #6814 from viirya/fix_decimal2 and squashes the following commits:

071a757 [Liang-Chi Hsieh] Remove maximum precision and use 
MathContext.UNLIMITED.
df217d4 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into 
fix_decimal2
a43bfc3 [Liang-Chi Hsieh] Add MathContext with maximum supported precision.
72eeb3f [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into 
fix_decimal2
44c9348 [Liang-Chi Hsieh] Fix incorrect decimal precision after multiplication.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/31bd3068
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/31bd3068
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/31bd3068

Branch: refs/heads/master
Commit: 31bd30687bc29c0e457c37308d489ae2b6e5b72a
Parents: d4f6335
Author: Liang-Chi Hsieh vii...@gmail.com
Authored: Mon Jun 22 23:11:56 2015 -0700
Committer: Davies Liu dav...@databricks.com
Committed: Mon Jun 22 23:11:56 2015 -0700

--
 .../src/main/scala/org/apache/spark/sql/types/Decimal.scala| 6 --
 .../org/apache/spark/sql/types/decimal/DecimalSuite.scala  | 5 +
 2 files changed, 9 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/31bd3068/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala
index a85af9e..bd9823b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.sql.types
 
+import java.math.{MathContext, RoundingMode}
+
 import org.apache.spark.annotation.DeveloperApi
 
 /**
@@ -137,9 +139,9 @@ final class Decimal extends Ordered[Decimal] with 
Serializable {
 
   def toBigDecimal: BigDecimal = {
 if (decimalVal.ne(null)) {
-  decimalVal
+  decimalVal(MathContext.UNLIMITED)
 } else {
-  BigDecimal(longVal, _scale)
+  BigDecimal(longVal, _scale)(MathContext.UNLIMITED)
 }
   }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/31bd3068/sql/catalyst/src/test/scala/org/apache/spark/sql/types/decimal/DecimalSuite.scala
--
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/decimal/DecimalSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/decimal/DecimalSuite.scala
index 4c0365c..ccc29c0 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/decimal/DecimalSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/decimal/DecimalSuite.scala
@@ -162,4 +162,9 @@ class DecimalSuite extends SparkFunSuite with 
PrivateMethodTester {
 assert(new Decimal().set(100L, 10, 0).toUnscaledLong === 100L)
 assert(Decimal(Long.MaxValue, 100, 0).toUnscaledLong === Long.MaxValue)
   }
+
+  test("accurate precision after multiplication") {
+    val decimal = (Decimal(Long.MaxValue, 38, 0) * Decimal(Long.MaxValue, 38, 0)).toJavaBigDecimal
+    assert(decimal.unscaledValue.toString === "85070591730234615847396907784232501249")
+  }
 }


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-8541] [PYSPARK] test the absolute error in approx doctests

2015-06-23 Thread joshrosen
Repository: spark
Updated Branches:
  refs/heads/master 9b618fb0d - f0dcbe8a7


[SPARK-8541] [PYSPARK] test the absolute error in approx doctests

A minor change but one which is (presumably) visible on the public api docs 
webpage.

Author: Scott Taylor git...@megatron.me.uk

Closes #6942 from megatron-me-uk/patch-3 and squashes the following commits:

fbed000 [Scott Taylor] test the absolute error in approx doctests


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f0dcbe8a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f0dcbe8a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f0dcbe8a

Branch: refs/heads/master
Commit: f0dcbe8a7c2de510b47a21eb45cde34777638758
Parents: 9b618fb
Author: Scott Taylor git...@megatron.me.uk
Authored: Mon Jun 22 23:37:56 2015 -0700
Committer: Josh Rosen joshro...@databricks.com
Committed: Mon Jun 22 23:37:56 2015 -0700

--
 python/pyspark/rdd.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/f0dcbe8a/python/pyspark/rdd.py
--
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 20c0bc9..1b64be2 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -2198,7 +2198,7 @@ class RDD(object):
 
 >>> rdd = sc.parallelize(range(1000), 10)
 >>> r = sum(range(1000))
- >>> (rdd.sumApprox(1000) - r) / r < 0.05
+ >>> abs(rdd.sumApprox(1000) - r) / r < 0.05
 True
 """
 jrdd = self.mapPartitions(lambda it: [float(sum(it))])._to_java_object_rdd()
@@ -2215,7 +2215,7 @@ class RDD(object):
 
 >>> rdd = sc.parallelize(range(1000), 10)
 >>> r = sum(range(1000)) / 1000.0
- >>> (rdd.meanApprox(1000) - r) / r < 0.05
+ >>> abs(rdd.meanApprox(1000) - r) / r < 0.05
 True
 
 jrdd = self.map(float)._to_java_object_rdd()


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-8483] [STREAMING] Remove commons-lang3 dependency from Flume Sink

2015-06-23 Thread tdas
Repository: spark
Updated Branches:
  refs/heads/branch-1.4 d0943afbc - 929479675


[SPARK-8483] [STREAMING] Remove commons-lang3 dependency from Flume Sink

Author: Hari Shreedharan hshreedha...@apache.org

Closes #6910 from harishreedharan/remove-commons-lang3 and squashes the 
following commits:

9875f7d [Hari Shreedharan] Revert back to Flume 1.4.0
ca35eb0 [Hari Shreedharan] [SPARK-8483][Streaming] Remove commons-lang3 
dependency from Flume Sink. Also bump Flume version to 1.6.0


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/92947967
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/92947967
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/92947967

Branch: refs/heads/branch-1.4
Commit: 9294796750f9c9330ab113f025763e68b624abc9
Parents: d0943af
Author: Hari Shreedharan hshreedha...@apache.org
Authored: Mon Jun 22 23:34:17 2015 -0700
Committer: Tathagata Das tathagata.das1...@gmail.com
Committed: Mon Jun 22 23:41:35 2015 -0700

--
 external/flume-sink/pom.xml  | 4 
 .../spark/streaming/flume/sink/SparkAvroCallbackHandler.scala| 4 ++--
 2 files changed, 2 insertions(+), 6 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/92947967/external/flume-sink/pom.xml
--
diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml
index 37f2b1b..ad431fa 100644
--- a/external/flume-sink/pom.xml
+++ b/external/flume-sink/pom.xml
@@ -36,10 +36,6 @@
 
   <dependencies>
     <dependency>
-      <groupId>org.apache.commons</groupId>
-      <artifactId>commons-lang3</artifactId>
-    </dependency>
-    <dependency>
       <groupId>org.apache.flume</groupId>
       <artifactId>flume-ng-sdk</artifactId>
       <exclusions>

http://git-wip-us.apache.org/repos/asf/spark/blob/92947967/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkAvroCallbackHandler.scala
--
diff --git 
a/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkAvroCallbackHandler.scala
 
b/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkAvroCallbackHandler.scala
index dc2a4ab..719fca0 100644
--- 
a/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkAvroCallbackHandler.scala
+++ 
b/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkAvroCallbackHandler.scala
@@ -16,13 +16,13 @@
  */
 package org.apache.spark.streaming.flume.sink
 
+import java.util.UUID
 import java.util.concurrent.{CountDownLatch, Executors}
 import java.util.concurrent.atomic.AtomicLong
 
 import scala.collection.mutable
 
 import org.apache.flume.Channel
-import org.apache.commons.lang3.RandomStringUtils
 
 /**
  * Class that implements the SparkFlumeProtocol, that is used by the Avro 
Netty Server to process
@@ -53,7 +53,7 @@ private[flume] class SparkAvroCallbackHandler(val threads: 
Int, val channel: Cha
   // Since the new txn may not have the same sequence number we must guard 
against accidentally
   // committing a new transaction. To reduce the probability of that happening 
a random string is
   // prepended to the sequence number. Does not change for life of sink
-  private val seqBase = RandomStringUtils.randomAlphanumeric(8)
+  private val seqBase = UUID.randomUUID().toString.substring(0, 8)
   private val seqCounter = new AtomicLong(0)
 
   // Protected by `sequenceNumberToProcessor`


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-8431] [SPARKR] Add in operator to DataFrame Column in SparkR

2015-06-23 Thread davies
Repository: spark
Updated Branches:
  refs/heads/master 164fe2aa4 - d4f633514


[SPARK-8431] [SPARKR] Add in operator to DataFrame Column in SparkR

[[SPARK-8431] Add in operator to DataFrame Column in SparkR - ASF 
JIRA](https://issues.apache.org/jira/browse/SPARK-8431)

Author: Yu ISHIKAWA yuu.ishik...@gmail.com

Closes #6941 from yu-iskw/SPARK-8431 and squashes the following commits:

1f64423 [Yu ISHIKAWA] Modify the comment
f4309a7 [Yu ISHIKAWA] Make a `setMethod` for `%in%` be independent
6e37936 [Yu ISHIKAWA] Modify a variable name
c196173 [Yu ISHIKAWA] [SPARK-8431][SparkR] Add in operator to DataFrame Column 
in SparkR


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d4f63351
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d4f63351
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d4f63351

Branch: refs/heads/master
Commit: d4f633514a393320c9ae64c00a75f702e6f58c67
Parents: 164fe2a
Author: Yu ISHIKAWA yuu.ishik...@gmail.com
Authored: Mon Jun 22 23:04:36 2015 -0700
Committer: Davies Liu dav...@databricks.com
Committed: Mon Jun 22 23:04:36 2015 -0700

--
 R/pkg/R/column.R | 16 
 R/pkg/inst/tests/test_sparkSQL.R | 10 ++
 2 files changed, 26 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/d4f63351/R/pkg/R/column.R
--
diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R
index 80e92d3..8e4b0f5 100644
--- a/R/pkg/R/column.R
+++ b/R/pkg/R/column.R
@@ -210,6 +210,22 @@ setMethod("cast",
 }
   })
 
+#' Match a column with given values.
+#'
+#' @rdname column
+#' @return a matched values as a result of comparing with given values.
+#' \dontrun{
+#'   filter(df, "age in (10, 30)")
+#'   where(df, df$age %in% c(10, 30))
+#' }
+setMethod("%in%",
+          signature(x = "Column"),
+          function(x, table) {
+            table <- listToSeq(as.list(table))
+            jc <- callJMethod(x@jc, "in", table)
+            return(column(jc))
+          })
+
 #' Approx Count Distinct
 #'
 #' @rdname column

http://git-wip-us.apache.org/repos/asf/spark/blob/d4f63351/R/pkg/inst/tests/test_sparkSQL.R
--
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index fc7f3f0..417153d 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -693,6 +693,16 @@ test_that("filter() on a DataFrame", {
   filtered2 <- where(df, df$name != "Michael")
   expect_true(count(filtered2) == 2)
   expect_true(collect(filtered2)$age[2] == 19)
+
+  # test suites for %in%
+  filtered3 <- filter(df, "age in (19)")
+  expect_equal(count(filtered3), 1)
+  filtered4 <- filter(df, "age in (19, 30)")
+  expect_equal(count(filtered4), 2)
+  filtered5 <- where(df, df$age %in% c(19))
+  expect_equal(count(filtered5), 1)
+  filtered6 <- where(df, df$age %in% c(19, 30))
+  expect_equal(count(filtered6), 2)
 })
 
 test_that("join() on a DataFrame", {


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-8541] [PYSPARK] test the absolute error in approx doctests

2015-06-23 Thread joshrosen
Repository: spark
Updated Branches:
  refs/heads/branch-1.4 22cc1ab66 - d0943afbc


[SPARK-8541] [PYSPARK] test the absolute error in approx doctests

A minor change but one which is (presumably) visible on the public api docs 
webpage.

Author: Scott Taylor git...@megatron.me.uk

Closes #6942 from megatron-me-uk/patch-3 and squashes the following commits:

fbed000 [Scott Taylor] test the absolute error in approx doctests

(cherry picked from commit f0dcbe8a7c2de510b47a21eb45cde34777638758)
Signed-off-by: Josh Rosen joshro...@databricks.com


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d0943afb
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d0943afb
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d0943afb

Branch: refs/heads/branch-1.4
Commit: d0943afbcffec5d8b668794dedc8d85fb10b0596
Parents: 22cc1ab
Author: Scott Taylor git...@megatron.me.uk
Authored: Mon Jun 22 23:37:56 2015 -0700
Committer: Josh Rosen joshro...@databricks.com
Committed: Mon Jun 22 23:38:21 2015 -0700

--
 python/pyspark/rdd.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/d0943afb/python/pyspark/rdd.py
--
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 20c0bc9..1b64be2 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -2198,7 +2198,7 @@ class RDD(object):
 
 >>> rdd = sc.parallelize(range(1000), 10)
 >>> r = sum(range(1000))
- >>> (rdd.sumApprox(1000) - r) / r < 0.05
+ >>> abs(rdd.sumApprox(1000) - r) / r < 0.05
 True
 """
 jrdd = self.mapPartitions(lambda it: [float(sum(it))])._to_java_object_rdd()
@@ -2215,7 +2215,7 @@ class RDD(object):
 
 >>> rdd = sc.parallelize(range(1000), 10)
 >>> r = sum(range(1000)) / 1000.0
- >>> (rdd.meanApprox(1000) - r) / r < 0.05
+ >>> abs(rdd.meanApprox(1000) - r) / r < 0.05
 True
 
 jrdd = self.map(float)._to_java_object_rdd()


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-8541] [PYSPARK] test the absolute error in approx doctests

2015-06-23 Thread joshrosen
Repository: spark
Updated Branches:
  refs/heads/branch-1.3 45b4527e3 - 716dcf631


[SPARK-8541] [PYSPARK] test the absolute error in approx doctests

A minor change but one which is (presumably) visible on the public api docs 
webpage.

Author: Scott Taylor git...@megatron.me.uk

Closes #6942 from megatron-me-uk/patch-3 and squashes the following commits:

fbed000 [Scott Taylor] test the absolute error in approx doctests

(cherry picked from commit f0dcbe8a7c2de510b47a21eb45cde34777638758)
Signed-off-by: Josh Rosen joshro...@databricks.com


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/716dcf63
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/716dcf63
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/716dcf63

Branch: refs/heads/branch-1.3
Commit: 716dcf631558920c080cb824dcd617789b9f96d5
Parents: 45b4527
Author: Scott Taylor git...@megatron.me.uk
Authored: Mon Jun 22 23:37:56 2015 -0700
Committer: Josh Rosen joshro...@databricks.com
Committed: Mon Jun 22 23:39:39 2015 -0700

--
 python/pyspark/rdd.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/716dcf63/python/pyspark/rdd.py
--
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index d80366a..bd18cb3 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -2130,7 +2130,7 @@ class RDD(object):
 
 >>> rdd = sc.parallelize(range(1000), 10)
 >>> r = sum(xrange(1000))
- >>> (rdd.sumApprox(1000) - r) / r < 0.05
+ >>> abs(rdd.sumApprox(1000) - r) / r < 0.05
 True
 """
 jrdd = self.mapPartitions(lambda it: [float(sum(it))])._to_java_object_rdd()
@@ -2147,7 +2147,7 @@ class RDD(object):
 
 >>> rdd = sc.parallelize(range(1000), 10)
 >>> r = sum(xrange(1000)) / 1000.0
- >>> (rdd.meanApprox(1000) - r) / r < 0.05
+ >>> abs(rdd.meanApprox(1000) - r) / r < 0.05
 True
 
 jrdd = self.map(float)._to_java_object_rdd()


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-8300] DataFrame hint for broadcast join.

2015-06-23 Thread marmbrus
Repository: spark
Updated Branches:
  refs/heads/master f0dcbe8a7 - 6ceb16960


[SPARK-8300] DataFrame hint for broadcast join.

Users can now do
```scala
left.join(broadcast(right), "joinKey")
```
to give the query planner a hint that the right DataFrame is small and should be 
broadcast.
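
As a usage sketch (the table names and join key below are made up; `broadcast` is the new function this change adds to `org.apache.spark.sql.functions`):

```scala
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.broadcast

// Force the small dimension table onto every executor instead of relying on the
// planner's size estimate (spark.sql.autoBroadcastJoinThreshold).
def joinWithSmallDim(fact: DataFrame, smallDim: DataFrame, joinKey: String): DataFrame =
  fact.join(broadcast(smallDim), joinKey)
```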

Author: Reynold Xin r...@databricks.com

Closes #6751 from rxin/broadcastjoin-hint and squashes the following commits:

953eec2 [Reynold Xin] Code review feedback.
88752d8 [Reynold Xin] Fixed import.
8187b88 [Reynold Xin] [SPARK-8300] DataFrame hint for broadcast join.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6ceb1696
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6ceb1696
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6ceb1696

Branch: refs/heads/master
Commit: 6ceb169608428a651d53c93bf73ca5ac53a6bde2
Parents: f0dcbe8
Author: Reynold Xin r...@databricks.com
Authored: Tue Jun 23 01:50:31 2015 -0700
Committer: Michael Armbrust mich...@databricks.com
Committed: Tue Jun 23 01:50:31 2015 -0700

--
 .../catalyst/plans/logical/basicOperators.scala |  8 +++
 .../spark/sql/execution/SparkStrategies.scala   | 25 +---
 .../scala/org/apache/spark/sql/functions.scala  | 17 +
 .../apache/spark/sql/DataFrameJoinSuite.scala   | 17 +
 4 files changed, 59 insertions(+), 8 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/6ceb1696/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
index f8e5916..7814e51 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
@@ -130,6 +130,14 @@ case class Join(
   }
 }
 
+/**
+ * A hint for the optimizer that we should broadcast the `child` if used in a 
join operator.
+ */
+case class BroadcastHint(child: LogicalPlan) extends UnaryNode {
+  override def output: Seq[Attribute] = child.output
+}
+
+
 case class Except(left: LogicalPlan, right: LogicalPlan) extends BinaryNode {
   override def output: Seq[Attribute] = left.output
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/6ceb1696/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
--
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
index 422992d..5c420eb 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql.execution
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.planning._
 import org.apache.spark.sql.catalyst.plans._
-import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.catalyst.plans.logical.{BroadcastHint, LogicalPlan}
 import org.apache.spark.sql.catalyst.plans.physical._
 import org.apache.spark.sql.columnar.{InMemoryColumnarTableScan, 
InMemoryRelation}
 import org.apache.spark.sql.execution.{DescribeCommand => RunnableDescribeCommand}
@@ -53,6 +53,18 @@ private[sql] abstract class SparkStrategies extends 
QueryPlanner[SparkPlan] {
   }
 
   /**
+   * Matches a plan whose output should be small enough to be used in 
broadcast join.
+   */
+  object CanBroadcast {
+def unapply(plan: LogicalPlan): Option[LogicalPlan] = plan match {
+  case BroadcastHint(p) => Some(p)
+  case p if sqlContext.conf.autoBroadcastJoinThreshold > 0 &&
+    p.statistics.sizeInBytes <= sqlContext.conf.autoBroadcastJoinThreshold => Some(p)
+  case _ => None
+}
+  }
+
+  /**
* Uses the ExtractEquiJoinKeys pattern to find joins where at least some of 
the predicates can be
* evaluated by matching hash keys.
*
@@ -80,15 +92,11 @@ private[sql] abstract class SparkStrategies extends 
QueryPlanner[SparkPlan] {
 }
 
 def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
-  case ExtractEquiJoinKeys(Inner, leftKeys, rightKeys, condition, left, right)
-    if sqlContext.conf.autoBroadcastJoinThreshold > 0 &&
-       right.statistics.sizeInBytes <= sqlContext.conf.autoBroadcastJoinThreshold =>
+  case ExtractEquiJoinKeys(Inner, leftKeys, rightKeys, condition, left, CanBroadcast(right)) =>
 

spark git commit: Revert [SPARK-7157][SQL] add sampleBy to DataFrame

2015-06-23 Thread rxin
Repository: spark
Updated Branches:
  refs/heads/master 0401cbaa8 - a458efc66


Revert [SPARK-7157][SQL] add sampleBy to DataFrame

This reverts commit 0401cbaa8ee51c71f43604f338b65022a479da0a.

The new test case on Jenkins is failing.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a458efc6
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a458efc6
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a458efc6

Branch: refs/heads/master
Commit: a458efc66c31dc281af379b914bfa2b077ca6635
Parents: 0401cba
Author: Reynold Xin r...@databricks.com
Authored: Tue Jun 23 19:30:25 2015 -0700
Committer: Reynold Xin r...@databricks.com
Committed: Tue Jun 23 19:30:25 2015 -0700

--
 python/pyspark/sql/dataframe.py | 40 
 .../spark/sql/DataFrameStatFunctions.scala  | 24 
 .../apache/spark/sql/DataFrameStatSuite.scala   | 12 +-
 3 files changed, 2 insertions(+), 74 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/a458efc6/python/pyspark/sql/dataframe.py
--
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 213338d..152b873 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -448,41 +448,6 @@ class DataFrame(object):
 rdd = self._jdf.sample(withReplacement, fraction, long(seed))
 return DataFrame(rdd, self.sql_ctx)
 
-@since(1.5)
-def sampleBy(self, col, fractions, seed=None):
-
-Returns a stratified sample without replacement based on the
-fraction given on each stratum.
-
-:param col: column that defines strata
-:param fractions:
-sampling fraction for each stratum. If a stratum is not
-specified, we treat its fraction as zero.
-:param seed: random seed
-:return: a new DataFrame that represents the stratified sample
-
- >>> from pyspark.sql.functions import col
- >>> dataset = sqlContext.range(0, 100).select((col("id") % 3).alias("key"))
- >>> sampled = dataset.sampleBy("key", fractions={0: 0.1, 1: 0.2}, seed=0)
- >>> sampled.groupBy("key").count().orderBy("key").show()
-+---+-----+
-|key|count|
-+---+-----+
-|  0|    5|
-|  1|    8|
-+---+-----+
-
-if not isinstance(col, str):
-    raise ValueError("col must be a string, but got %r" % type(col))
-if not isinstance(fractions, dict):
-    raise ValueError("fractions must be a dict but got %r" % type(fractions))
-for k, v in fractions.items():
-    if not isinstance(k, (float, int, long, basestring)):
-        raise ValueError("key must be float, int, long, or string, but got %r" % type(k))
-    fractions[k] = float(v)
-seed = seed if seed is not None else random.randint(0, sys.maxsize)
-return DataFrame(self._jdf.stat().sampleBy(col, self._jmap(fractions), seed), self.sql_ctx)
-
 @since(1.4)
 def randomSplit(self, weights, seed=None):
 Randomly splits this :class:`DataFrame` with the provided weights.
@@ -1357,11 +1322,6 @@ class DataFrameStatFunctions(object):
 
 freqItems.__doc__ = DataFrame.freqItems.__doc__
 
-def sampleBy(self, col, fractions, seed=None):
-return self.df.sampleBy(col, fractions, seed)
-
-sampleBy.__doc__ = DataFrame.sampleBy.__doc__
-
 
 def _test():
 import doctest

http://git-wip-us.apache.org/repos/asf/spark/blob/a458efc6/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
--
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
index 955d287..edb9ed7 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
@@ -17,8 +17,6 @@
 
 package org.apache.spark.sql
 
-import java.util.UUID
-
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.sql.execution.stat._
 
@@ -165,26 +163,4 @@ final class DataFrameStatFunctions private[sql](df: 
DataFrame) {
   def freqItems(cols: Seq[String]): DataFrame = {
 FrequentItems.singlePassFreqItems(df, cols, 0.01)
   }
-
-  /**
-   * Returns a stratified sample without replacement based on the fraction 
given on each stratum.
-   * @param col column that defines strata
-   * @param fractions sampling fraction for each stratum. If a stratum is not 
specified, we treat
-   *  its fraction as zero.
-   * @param seed random seed
-   * @return a new [[DataFrame]] that 

Git Push Summary

2015-06-23 Thread pwendell
Repository: spark
Updated Tags:  refs/tags/v1.4.1-rc1 [deleted] d0a5560ce

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-8573] [SPARK-8568] [SQL] [PYSPARK] raise Exception if column is used in boolean expression

2015-06-23 Thread davies
Repository: spark
Updated Branches:
  refs/heads/branch-1.4 8d6e3636e - 13f7b0a91


[SPARK-8573] [SPARK-8568] [SQL] [PYSPARK] raise Exception if column is used in 
boolean expression

It's a common mistake for a user to put a Column in a boolean expression 
(together with `and`, `or`), which does not work as expected; we should raise 
an exception in that case and suggest the user use `&`, `|` instead.

Author: Davies Liu dav...@databricks.com

Closes #6961 from davies/column_bool and squashes the following commits:

9f19beb [Davies Liu] update message
af74bd6 [Davies Liu] fix tests
07dff84 [Davies Liu] address comments, fix tests
f70c08e [Davies Liu] raise Exception if column is used in booelan expression

(cherry picked from commit 7fb5ae5024284593204779ff463bfbdb4d1c6da5)
Signed-off-by: Davies Liu dav...@databricks.com


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/13f7b0a9
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/13f7b0a9
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/13f7b0a9

Branch: refs/heads/branch-1.4
Commit: 13f7b0a910557fb5564636031f43c2674a7dfa28
Parents: 8d6e363
Author: Davies Liu dav...@databricks.com
Authored: Tue Jun 23 15:51:16 2015 -0700
Committer: Davies Liu dav...@databricks.com
Committed: Tue Jun 23 15:51:35 2015 -0700

--
 python/pyspark/sql/column.py |  5 +
 python/pyspark/sql/tests.py  | 10 +-
 2 files changed, 14 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/13f7b0a9/python/pyspark/sql/column.py
--
diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py
index 1ecec5b..0a85da7 100644
--- a/python/pyspark/sql/column.py
+++ b/python/pyspark/sql/column.py
@@ -396,6 +396,11 @@ class Column(object):
 jc = self._jc.over(window._jspec)
 return Column(jc)
 
+    def __nonzero__(self):
+        raise ValueError("Cannot convert column into bool: please use '&' for 'and', '|' for 'or', "
+                         "'~' for 'not' when building DataFrame boolean expressions.")
+    __bool__ = __nonzero__
+
 def __repr__(self):
 return 'Column%s' % self._jc.toString().encode('utf8')
 

http://git-wip-us.apache.org/repos/asf/spark/blob/13f7b0a9/python/pyspark/sql/tests.py
--
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 4e72407..5c25890 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -149,6 +149,14 @@ class SQLTests(ReusedPySparkTestCase):
 self.assertEqual(result[0][0], a)
 self.assertEqual(result[0][1], b)
 
+def test_and_in_expression(self):
+    self.assertEqual(4, self.df.filter((self.df.key <= 10) & (self.df.value <= "2")).count())
+    self.assertRaises(ValueError, lambda: (self.df.key <= 10) and (self.df.value <= "2"))
+    self.assertEqual(14, self.df.filter((self.df.key <= 3) | (self.df.value < "2")).count())
+    self.assertRaises(ValueError, lambda: self.df.key <= 3 or self.df.value < "2")
+    self.assertEqual(99, self.df.filter(~(self.df.key == 1)).count())
+    self.assertRaises(ValueError, lambda: not self.df.key == 1)
+
 def test_udf_with_callable(self):
 d = [Row(number=i, squared=i**2) for i in range(10)]
 rdd = self.sc.parallelize(d)
@@ -393,7 +401,7 @@ class SQLTests(ReusedPySparkTestCase):
 self.assertTrue(isinstance((- ci - 1 - 2) % 3 * 2.5 / 3.5, Column))
 rcc = (1 + ci), (1 - ci), (1 * ci), (1 / ci), (1 % ci)
 self.assertTrue(all(isinstance(c, Column) for c in rcc))
-cb = [ci == 5, ci != 0, ci > 3, ci < 4, ci >= 0, ci <= 7, ci and cs, ci or cs]
+cb = [ci == 5, ci != 0, ci > 3, ci < 4, ci >= 0, ci <= 7]
 self.assertTrue(all(isinstance(c, Column) for c in cb))
 cbool = (ci & ci), (ci | ci), (~ci)
 self.assertTrue(all(isinstance(c, Column) for c in cbool))


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-8139] [SQL] Updates docs and comments of data sources and Parquet output committer options

2015-06-23 Thread lian
Repository: spark
Updated Branches:
  refs/heads/master 7fb5ae502 - 111d6b9b8


[SPARK-8139] [SQL] Updates docs and comments of data sources and Parquet output 
committer options

This PR only applies to master branch (1.5.0-SNAPSHOT) since it references 
`org.apache.parquet` classes which only appear in Parquet 1.7.0.

Author: Cheng Lian l...@databricks.com

Closes #6683 from liancheng/output-committer-docs and squashes the following 
commits:

b4648b8 [Cheng Lian] Removes spark.sql.sources.outputCommitterClass as it's not 
a public option
ee63923 [Cheng Lian] Updates docs and comments of data sources and Parquet 
output committer options
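
As a hedged illustration of the note added in the docs below (the option has to travel through the Hadoop Configuration, not SQLConf; `sc` is assumed to be an existing SparkContext):

```scala
// Select the S3-friendly committer shipped with Spark SQL; setting this key via
// SQLContext.setConf would be ignored, per the documentation added in this commit.
sc.hadoopConfiguration.set(
  "spark.sql.parquet.output.committer.class",
  "org.apache.spark.sql.parquet.DirectParquetOutputCommitter")
```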


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/111d6b9b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/111d6b9b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/111d6b9b

Branch: refs/heads/master
Commit: 111d6b9b8a584b962b6ae80c7aa8c45845ce0099
Parents: 7fb5ae5
Author: Cheng Lian l...@databricks.com
Authored: Tue Jun 23 17:24:26 2015 -0700
Committer: Cheng Lian l...@databricks.com
Committed: Tue Jun 23 17:24:26 2015 -0700

--
 docs/sql-programming-guide.md   | 30 -
 .../scala/org/apache/spark/sql/SQLConf.scala| 30 +
 .../parquet/DirectParquetOutputCommitter.scala  | 34 ++--
 .../apache/spark/sql/parquet/newParquet.scala   |  4 +--
 4 files changed, 78 insertions(+), 20 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/111d6b9b/docs/sql-programming-guide.md
--
diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 9107c9b..2786e3d 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -1348,6 +1348,34 @@ Configuration of Parquet can be done using the `setConf` 
method on `SQLContext`
 support.
   </td>
 </tr>
+<tr>
+  <td><code>spark.sql.parquet.output.committer.class</code></td>
+  <td><code>org.apache.parquet.hadoop.<br />ParquetOutputCommitter</code></td>
+  <td>
+    <p>
+      The output committer class used by Parquet. The specified class needs to be a subclass of
+      <code>org.apache.hadoop.<br />mapreduce.OutputCommitter</code>.  Typically, it's also a
+      subclass of <code>org.apache.parquet.hadoop.ParquetOutputCommitter</code>.
+    </p>
+    <p>
+      <b>Note:</b>
+      <ul>
+        <li>
+          This option must be set via Hadoop <code>Configuration</code> rather than Spark
+          <code>SQLConf</code>.
+        </li>
+        <li>
+          This option overrides <code>spark.sql.sources.<br />outputCommitterClass</code>.
+        </li>
+      </ul>
+    </p>
+    <p>
+      Spark SQL comes with a builtin
+      <code>org.apache.spark.sql.<br />parquet.DirectParquetOutputCommitter</code>, which can be
+      more efficient than the default Parquet output committer when writing data to S3.
+    </p>
+  </td>
+</tr>
 </table>
 
 ## JSON Datasets
@@ -1876,7 +1904,7 @@ that these options will be deprecated in future release as more optimizations ar
       Configures the number of partitions to use when shuffling data for joins or aggregations.
     </td>
   </tr>
-   <tr>
+  <tr>
     <td><code>spark.sql.planner.externalSort</code></td>
     <td>false</td>
     <td>
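
For illustration, a minimal PySpark sketch of using the option documented above. It assumes an existing SparkContext `sc` and a DataFrame `df` (both hypothetical here), and reaches the Hadoop Configuration through the private `_jsc` gateway, since the committer class is read from the Hadoop Configuration rather than from SQLConf:

    # Sketch: point Parquet writes at the bundled DirectParquetOutputCommitter.
    # The option lives in the Hadoop Configuration, so it is set on the SparkContext's
    # configuration (via the private _jsc handle) instead of sqlContext.setConf().
    sc._jsc.hadoopConfiguration().set(
        "spark.sql.parquet.output.committer.class",
        "org.apache.spark.sql.parquet.DirectParquetOutputCommitter")

    # Subsequent Parquet writes pick the committer up from the job configuration.
    df.write.parquet("s3n://some-bucket/some/path")  # hypothetical S3 location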

http://git-wip-us.apache.org/repos/asf/spark/blob/111d6b9b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
index 16493c3..2653526 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
@@ -22,6 +22,8 @@ import java.util.Properties
 import scala.collection.immutable
 import scala.collection.JavaConversions._
 
+import org.apache.parquet.hadoop.ParquetOutputCommitter
+
 import org.apache.spark.sql.catalyst.CatalystConf
 
 private[spark] object SQLConf {
@@ -252,9 +254,9 @@ private[spark] object SQLConf {
 
   val PARQUET_FILTER_PUSHDOWN_ENABLED = booleanConf("spark.sql.parquet.filterPushdown",
     defaultValue = Some(false),
-    doc = "Turn on Parquet filter pushdown optimization. This feature is turned off by default" +
-      " because of a known bug in Paruet 1.6.0rc3 " +
-      "(<a href=\"https://issues.apache.org/jira/browse/PARQUET-136\">PARQUET-136</a>). However, " +
+    doc = "Turn on Parquet filter pushdown optimization. This feature is turned off by default " +
+      "because of a known bug in Parquet 1.6.0rc3 " +
+      "(PARQUET-136, https://issues.apache.org/jira/browse/PARQUET-136). However, " +
       "if your table doesn't contain any nullable string or binary columns, it's still safe to " +
       "turn this feature on.")
 
@@ -262,11 +264,21 @@ private[spark] 
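
For completeness, a hedged sketch of flipping the pushdown flag described in the doc string above. It assumes an existing `sqlContext`, a hypothetical Parquet path, and a table with no nullable string or binary columns (so the PARQUET-136 caveat does not apply):

    # Sketch: opt in to Parquet filter pushdown despite the PARQUET-136 caveat.
    sqlContext.setConf("spark.sql.parquet.filterPushdown", "true")

    df = sqlContext.read.parquet("/path/to/table")  # hypothetical path
    df.filter(df.key > 10).count()  # the comparison can now be evaluated inside Parquet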

spark git commit: [SPARK-8573] [SPARK-8568] [SQL] [PYSPARK] raise Exception if column is used in boolean expression

2015-06-23 Thread davies
Repository: spark
Updated Branches:
  refs/heads/master d96d7b557 -> 7fb5ae502


[SPARK-8573] [SPARK-8568] [SQL] [PYSPARK] raise Exception if column is used in 
boolean expression

It's a common mistake for users to put a Column in a boolean expression 
(together with `and`, `or`), which does not work as expected; we should raise 
an exception in that case and suggest using `&`, `|` instead.
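
For illustration, a minimal sketch of the behaviour this enforces, assuming a hypothetical DataFrame `df` with an integer column `key`:

    # Bitwise operators build Column expressions, so this composes correctly:
    df.filter((df.key > 1) & (df.key < 10)).count()

    # Python's `and`/`or`/`not` call bool() on the Column first; with this patch
    # that raises ValueError instead of silently keeping only one operand:
    df.filter(df.key > 1 and df.key < 10)  # raises ValueError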

Author: Davies Liu dav...@databricks.com

Closes #6961 from davies/column_bool and squashes the following commits:

9f19beb [Davies Liu] update message
af74bd6 [Davies Liu] fix tests
07dff84 [Davies Liu] address comments, fix tests
f70c08e [Davies Liu] raise Exception if column is used in boolean expression


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7fb5ae50
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7fb5ae50
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7fb5ae50

Branch: refs/heads/master
Commit: 7fb5ae5024284593204779ff463bfbdb4d1c6da5
Parents: d96d7b5
Author: Davies Liu dav...@databricks.com
Authored: Tue Jun 23 15:51:16 2015 -0700
Committer: Davies Liu dav...@databricks.com
Committed: Tue Jun 23 15:51:16 2015 -0700

--
 python/pyspark/sql/column.py |  5 +
 python/pyspark/sql/tests.py  | 10 +-
 2 files changed, 14 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/7fb5ae50/python/pyspark/sql/column.py
--
diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py
index 1ecec5b..0a85da7 100644
--- a/python/pyspark/sql/column.py
+++ b/python/pyspark/sql/column.py
@@ -396,6 +396,11 @@ class Column(object):
 jc = self._jc.over(window._jspec)
 return Column(jc)
 
+    def __nonzero__(self):
+        raise ValueError("Cannot convert column into bool: please use '&' for 'and', '|' for 'or', "
+                         "'~' for 'not' when building DataFrame boolean expressions.")
+    __bool__ = __nonzero__
+
 def __repr__(self):
 return 'Column<%s>' % self._jc.toString().encode('utf8')
 

http://git-wip-us.apache.org/repos/asf/spark/blob/7fb5ae50/python/pyspark/sql/tests.py
--
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 13f4556..e6a434e 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -164,6 +164,14 @@ class SQLTests(ReusedPySparkTestCase):
 self.assertEqual(result[0][0], a)
 self.assertEqual(result[0][1], b)
 
+    def test_and_in_expression(self):
+        self.assertEqual(4, self.df.filter((self.df.key <= 10) & (self.df.value <= "2")).count())
+        self.assertRaises(ValueError, lambda: (self.df.key <= 10) and (self.df.value <= "2"))
+        self.assertEqual(14, self.df.filter((self.df.key <= 3) | (self.df.value < "2")).count())
+        self.assertRaises(ValueError, lambda: self.df.key <= 3 or self.df.value < "2")
+        self.assertEqual(99, self.df.filter(~(self.df.key == 1)).count())
+        self.assertRaises(ValueError, lambda: not self.df.key == 1)
+
+
 def test_udf_with_callable(self):
 d = [Row(number=i, squared=i**2) for i in range(10)]
 rdd = self.sc.parallelize(d)
@@ -408,7 +416,7 @@ class SQLTests(ReusedPySparkTestCase):
 self.assertTrue(isinstance((- ci - 1 - 2) % 3 * 2.5 / 3.5, Column))
 rcc = (1 + ci), (1 - ci), (1 * ci), (1 / ci), (1 % ci)
 self.assertTrue(all(isinstance(c, Column) for c in rcc))
-cb = [ci == 5, ci != 0, ci < 3, ci > 4, ci >= 0, ci <= 7, ci and cs, ci or cs]
+cb = [ci == 5, ci != 0, ci < 3, ci > 4, ci >= 0, ci <= 7]
 self.assertTrue(all(isinstance(c, Column) for c in cb))
 cbool = (ci & ci), (ci | ci), (~ci)
 self.assertTrue(all(isinstance(c, Column) for c in cbool))


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org