[GitHub] spark pull request: [SPARK-9303] Decimal should use java.math.Deci...

2015-07-24 Thread JoshRosen
Github user JoshRosen commented on the pull request:

https://github.com/apache/spark/pull/7635#issuecomment-124339199
  
/cc @rxin @davies


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] spark pull request: [SPARK-9069] [SPARK-9264] [SQL] remove unlimit...

2015-07-24 Thread davies
Github user davies commented on a diff in the pull request:

https://github.com/apache/spark/pull/7605#discussion_r35397614
  
--- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala ---
@@ -334,144 +334,94 @@ object HiveTypeCoercion {
    * - SHORT gets turned into DECIMAL(5, 0)
    * - INT gets turned into DECIMAL(10, 0)
    * - LONG gets turned into DECIMAL(20, 0)
-   * - FLOAT and DOUBLE
-   *   1. Union, Intersect and Except operations:
-   *      FLOAT gets turned into DECIMAL(7, 7), DOUBLE gets turned into DECIMAL(15, 15) (this is the
-   *      same as Hive)
-   *   2. Other operation:
-   *      FLOAT and DOUBLE cause fixed-length decimals to turn into DOUBLE (this is the same as Hive,
-   *      but note that unlimited decimals are considered bigger than doubles in WidenTypes)
+   * - FLOAT and DOUBLE cause fixed-length decimals to turn into DOUBLE
+   *
+   * Note: Union/Except/Intersect are handled by WidenTypes
    */
   // scalastyle:on
   object DecimalPrecision extends Rule[LogicalPlan] {
     import scala.math.{max, min}
 
-    // Conversion rules for integer types into fixed-precision decimals
-    private val intTypeToFixed: Map[DataType, DecimalType] = Map(
-      ByteType -> DecimalType(3, 0),
-      ShortType -> DecimalType(5, 0),
-      IntegerType -> DecimalType(10, 0),
-      LongType -> DecimalType(20, 0)
-    )
-
     private def isFloat(t: DataType): Boolean = t == FloatType || t == DoubleType
 
-    // Conversion rules for float and double into fixed-precision decimals
-    private val floatTypeToFixed: Map[DataType, DecimalType] = Map(
-      FloatType -> DecimalType(7, 7),
-      DoubleType -> DecimalType(15, 15)
-    )
-
-    private def castDecimalPrecision(
-        left: LogicalPlan,
-        right: LogicalPlan): (LogicalPlan, LogicalPlan) = {
-      val castedInput = left.output.zip(right.output).map {
-        case (lhs, rhs) if lhs.dataType != rhs.dataType =>
-          (lhs.dataType, rhs.dataType) match {
-            case (DecimalType.Fixed(p1, s1), DecimalType.Fixed(p2, s2)) =>
-              // Decimals with precision/scale p1/s1 and p2/s2 will be promoted to
-              // DecimalType(max(s1, s2) + max(p1-s1, p2-s2), max(s1, s2))
-              val fixedType = DecimalType(max(s1, s2) + max(p1 - s1, p2 - s2), max(s1, s2))
-              (Alias(Cast(lhs, fixedType), lhs.name)(), Alias(Cast(rhs, fixedType), rhs.name)())
-            case (t, DecimalType.Fixed(p, s)) if intTypeToFixed.contains(t) =>
-              (Alias(Cast(lhs, intTypeToFixed(t)), lhs.name)(), rhs)
-            case (DecimalType.Fixed(p, s), t) if intTypeToFixed.contains(t) =>
-              (lhs, Alias(Cast(rhs, intTypeToFixed(t)), rhs.name)())
-            case (t, DecimalType.Fixed(p, s)) if floatTypeToFixed.contains(t) =>
-              (Alias(Cast(lhs, floatTypeToFixed(t)), lhs.name)(), rhs)
-            case (DecimalType.Fixed(p, s), t) if floatTypeToFixed.contains(t) =>
-              (lhs, Alias(Cast(rhs, floatTypeToFixed(t)), rhs.name)())
-            case _ => (lhs, rhs)
-          }
-        case other => other
-      }
-
-      val (castedLeft, castedRight) = castedInput.unzip
+    // Returns a decimal type that is wider than both of the input types
+    def widerDecimalType(d1: DecimalType, d2: DecimalType): DecimalType = {
+      widerDecimalType(d1.precision, d1.scale, d2.precision, d2.scale)
+    }
+    // max(s1, s2) + max(p1-s1, p2-s2), max(s1, s2)
+    def widerDecimalType(p1: Int, s1: Int, p2: Int, s2: Int): DecimalType = {
+      val scale = max(s1, s2)
+      val range = max(p1 - s1, p2 - s2)
+      DecimalType.bounded(range + scale, scale)
+    }
 
-      val newLeft =
-        if (castedLeft.map(_.dataType) != left.output.map(_.dataType)) {
-          Project(castedLeft, left)
-        } else {
-          left
-        }
+    /**
+     * An expression used to wrap the children when promoting the precision of DecimalType,
+     * to avoid promoting multiple times.
+     */
+    case class ChangePrecision(child: Expression) extends UnaryExpression {
+      override def dataType: DataType = child.dataType
+      override def eval(input: InternalRow): Any = child.eval(input)
+      override def gen(ctx: CodeGenContext): GeneratedExpressionCode = child.gen(ctx)
+      override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = ""
+      override def prettyName: String = "change_precision"
+    }
 
-      val newRight =
-        if (castedRight.map(_.dataType) != right.output.map(_.dataType)) {
-          Project(castedRight, right)
-        }
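
The new rule above reduces decimal promotion to one formula; a small standalone sketch (plain Scala, illustrative values, not part of the patch) of how the widened type is computed:

    // Worked example of the widening formula: for DECIMAL(p1, s1) and
    // DECIMAL(p2, s2), keep the larger scale and the larger integral range.
    val (p1, s1) = (10, 2)   // DECIMAL(10, 2)
    val (p2, s2) = (5, 4)    // DECIMAL(5, 4)

    val scale = math.max(s1, s2)            // 4 fractional digits
    val range = math.max(p1 - s1, p2 - s2)  // 8 integral digits

    // The wider type is DECIMAL(range + scale, scale) = DECIMAL(12, 4),
    // which can hold any value of either input type without loss.
    println(s"DECIMAL(${range + scale}, $scale)")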

[GitHub] spark pull request: [SPARK-9069] [SPARK-9264] [SQL] remove unlimit...

2015-07-24 Thread rxin
Github user rxin commented on a diff in the pull request:

https://github.com/apache/spark/pull/7605#discussion_r35397548
  
--- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala ---
@@ -334,144 +334,94 @@ object HiveTypeCoercion {
    * - SHORT gets turned into DECIMAL(5, 0)
    * - INT gets turned into DECIMAL(10, 0)
    * - LONG gets turned into DECIMAL(20, 0)
-   * - FLOAT and DOUBLE
-   *   1. Union, Intersect and Except operations:
-   *      FLOAT gets turned into DECIMAL(7, 7), DOUBLE gets turned into DECIMAL(15, 15) (this is the
-   *      same as Hive)
-   *   2. Other operation:
-   *      FLOAT and DOUBLE cause fixed-length decimals to turn into DOUBLE (this is the same as Hive,
-   *      but note that unlimited decimals are considered bigger than doubles in WidenTypes)
+   * - FLOAT and DOUBLE cause fixed-length decimals to turn into DOUBLE
+   *
+   * Note: Union/Except/Intersect are handled by WidenTypes
    */
   // scalastyle:on
   object DecimalPrecision extends Rule[LogicalPlan] {
     import scala.math.{max, min}
 
-    // Conversion rules for integer types into fixed-precision decimals
-    private val intTypeToFixed: Map[DataType, DecimalType] = Map(
-      ByteType -> DecimalType(3, 0),
-      ShortType -> DecimalType(5, 0),
-      IntegerType -> DecimalType(10, 0),
-      LongType -> DecimalType(20, 0)
-    )
-
     private def isFloat(t: DataType): Boolean = t == FloatType || t == DoubleType
 
-    // Conversion rules for float and double into fixed-precision decimals
-    private val floatTypeToFixed: Map[DataType, DecimalType] = Map(
-      FloatType -> DecimalType(7, 7),
-      DoubleType -> DecimalType(15, 15)
-    )
-
-    private def castDecimalPrecision(
-        left: LogicalPlan,
-        right: LogicalPlan): (LogicalPlan, LogicalPlan) = {
-      val castedInput = left.output.zip(right.output).map {
-        case (lhs, rhs) if lhs.dataType != rhs.dataType =>
-          (lhs.dataType, rhs.dataType) match {
-            case (DecimalType.Fixed(p1, s1), DecimalType.Fixed(p2, s2)) =>
-              // Decimals with precision/scale p1/s1 and p2/s2 will be promoted to
-              // DecimalType(max(s1, s2) + max(p1-s1, p2-s2), max(s1, s2))
-              val fixedType = DecimalType(max(s1, s2) + max(p1 - s1, p2 - s2), max(s1, s2))
-              (Alias(Cast(lhs, fixedType), lhs.name)(), Alias(Cast(rhs, fixedType), rhs.name)())
-            case (t, DecimalType.Fixed(p, s)) if intTypeToFixed.contains(t) =>
-              (Alias(Cast(lhs, intTypeToFixed(t)), lhs.name)(), rhs)
-            case (DecimalType.Fixed(p, s), t) if intTypeToFixed.contains(t) =>
-              (lhs, Alias(Cast(rhs, intTypeToFixed(t)), rhs.name)())
-            case (t, DecimalType.Fixed(p, s)) if floatTypeToFixed.contains(t) =>
-              (Alias(Cast(lhs, floatTypeToFixed(t)), lhs.name)(), rhs)
-            case (DecimalType.Fixed(p, s), t) if floatTypeToFixed.contains(t) =>
-              (lhs, Alias(Cast(rhs, floatTypeToFixed(t)), rhs.name)())
-            case _ => (lhs, rhs)
-          }
-        case other => other
-      }
-
-      val (castedLeft, castedRight) = castedInput.unzip
+    // Returns a decimal type that is wider than both of the input types
+    def widerDecimalType(d1: DecimalType, d2: DecimalType): DecimalType = {
+      widerDecimalType(d1.precision, d1.scale, d2.precision, d2.scale)
+    }
+    // max(s1, s2) + max(p1-s1, p2-s2), max(s1, s2)
+    def widerDecimalType(p1: Int, s1: Int, p2: Int, s2: Int): DecimalType = {
+      val scale = max(s1, s2)
+      val range = max(p1 - s1, p2 - s2)
+      DecimalType.bounded(range + scale, scale)
+    }
 
-      val newLeft =
-        if (castedLeft.map(_.dataType) != left.output.map(_.dataType)) {
-          Project(castedLeft, left)
-        } else {
-          left
-        }
+    /**
+     * An expression used to wrap the children when promoting the precision of DecimalType,
+     * to avoid promoting multiple times.
+     */
+    case class ChangePrecision(child: Expression) extends UnaryExpression {
+      override def dataType: DataType = child.dataType
+      override def eval(input: InternalRow): Any = child.eval(input)
+      override def gen(ctx: CodeGenContext): GeneratedExpressionCode = child.gen(ctx)
+      override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = ""
+      override def prettyName: String = "change_precision"
+    }
 
-      val newRight =
-        if (castedRight.map(_.dataType) != right.output.map(_.dataType)) {
-          Project(castedRight, right)
-        }
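
The `ChangePrecision` wrapper in the diff above exists so the rule does not promote the same subtree twice; a minimal sketch (hypothetical types, not project code) of that marker-node pattern:

    // A marker node tags subtrees that were already rewritten, so a second
    // pass of the rule leaves them alone (mirroring ChangePrecision's role).
    sealed trait Expr
    case class Num(v: BigDecimal) extends Expr
    case class Promoted(child: Expr) extends Expr  // the marker

    def promoteOnce(e: Expr): Expr = e match {
      case p: Promoted => p          // already promoted: do nothing
      case other       => Promoted(other)
    }

    val e = Num(BigDecimal("1.5"))
    assert(promoteOnce(promoteOnce(e)) == Promoted(e))  // idempotent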

[GitHub] spark pull request: [SPARK-9303] Decimal should use java.math.Deci...

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7635#issuecomment-124339249
  
 Merged build triggered.





[GitHub] spark pull request: [SPARK-5373][SQL] use transformDown when rewri...

2015-07-24 Thread SparkQA
Github user SparkQA commented on the pull request:

https://github.com/apache/spark/pull/7583#issuecomment-124339269
  
  [Test build #38305 has 
finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38305/console)
 for   PR 7583 at commit 
[`d6326d8`](https://github.com/apache/spark/commit/d6326d8d260f6ec125248e0eb25026b554248034).
 * This patch **passes all tests**.
 * This patch merges cleanly.
 * This patch adds no public classes.





[GitHub] spark pull request: [SPARK-9303] Decimal should use java.math.Deci...

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7635#issuecomment-124339270
  
Merged build started.





[GitHub] spark pull request: [SPARK-9069] [SQL] follow up

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7634#issuecomment-124339271
  
Merged build started.





[GitHub] spark pull request: [SPARK-9069] [SQL] follow up

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7634#issuecomment-124339252
  
 Merged build triggered.





[GitHub] spark pull request: [SPARK-5373][SQL] use transformDown when rewri...

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7583#issuecomment-124339334
  
Merged build finished. Test PASSed.





[GitHub] spark pull request: [SPARK-9222] [MLlib] Make class instantiation ...

2015-07-24 Thread SparkQA
Github user SparkQA commented on the pull request:

https://github.com/apache/spark/pull/7573#issuecomment-124342620
  
  [Test build #38313 has 
finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38313/console)
 for   PR 7573 at commit 
[`2f1a293`](https://github.com/apache/spark/commit/2f1a29312f2c5ee74ba416243a8724707b034071).
 * This patch **passes all tests**.
 * This patch merges cleanly.
 * This patch adds no public classes.





[GitHub] spark pull request: [SPARK-9303] Decimal should use java.math.Deci...

2015-07-24 Thread JoshRosen
Github user JoshRosen commented on a diff in the pull request:

https://github.com/apache/spark/pull/7635#discussion_r35398027
  
--- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala ---
@@ -135,19 +137,19 @@ final class Decimal extends Ordered[Decimal] with Serializable {
     this
   }
 
-  def toBigDecimal: BigDecimal = {
+  def toBigDecimal: BigDecimal = BigDecimal(toJavaBigDecimal)
--- End diff --

If we're in agreement that it should be removed, then I'm glad to update 
this patch to do so (which involves adding a MiMa exclusion, I think).  We 
could also just leave it and decide later.
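
For reference, the MiMa exclusion mentioned above would be a one-line filter in `project/MimaExcludes.scala`; a sketch only, since the exact problem type here is an assumption:

    // Hypothetical MiMa exclusion for removing Decimal.toBigDecimal; the
    // problem type (MissingMethodProblem) is an assumption, not verified.
    import com.typesafe.tools.mima.core._

    val excludes = Seq(
      ProblemFilters.exclude[MissingMethodProblem](
        "org.apache.spark.sql.types.Decimal.toBigDecimal")
    )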





[GitHub] spark pull request: [SPARK-9222] [MLlib] Make class instantiation ...

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7573#issuecomment-124342664
  
Merged build finished. Test PASSed.





[GitHub] spark pull request: [SPARK-8951][SparkR] support Unicode character...

2015-07-24 Thread CHOIJAEHONG1
Github user CHOIJAEHONG1 commented on the pull request:

https://github.com/apache/spark/pull/7494#issuecomment-124344827
  
good job, jenkins.





[GitHub] spark pull request: [SPARK-8930][SQL] Support a star '*' in genera...

2015-07-24 Thread SparkQA
Github user SparkQA commented on the pull request:

https://github.com/apache/spark/pull/7305#issuecomment-124344817
  
  [Test build #38323 has 
started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38323/consoleFull)
 for   PR 7305 at commit 
[`de3423f`](https://github.com/apache/spark/commit/de3423fa4327690f63bf5e6ab8bbee6955cc0da9).





[GitHub] spark pull request: [SPARK-8930][SQL] Support a star '*' in genera...

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7305#issuecomment-124345035
  
 Merged build triggered.





[GitHub] spark pull request: [build] Enable memory leak detection for Tungs...

2015-07-24 Thread rxin
Github user rxin commented on the pull request:

https://github.com/apache/spark/pull/7637#issuecomment-124345445
  
cc @JoshRosen 





[GitHub] spark pull request: [build] Enable memory leak detection for Tungs...

2015-07-24 Thread rxin
GitHub user rxin opened a pull request:

https://github.com/apache/spark/pull/7637

[build] Enable memory leak detection for Tungsten.

This was turned off accidentally in #7591.

You can merge this pull request into a Git repository by running:

$ git pull https://github.com/rxin/spark enable-mem-leak-detect

Alternatively you can review and apply these changes as the patch at:

https://github.com/apache/spark/pull/7637.patch

To close this pull request, make a commit to your master/trunk branch
with (at least) the following in the commit message:

This closes #7637


commit 34bc3efa60480f3af73cec4f85132e1afc2c5c7c
Author: Reynold Xin r...@databricks.com
Date:   2015-07-24T06:29:27Z

Enable memory leak detection for Tungsten.







[GitHub] spark pull request: [SPARK-5373][SQL] use transformDown when rewri...

2015-07-24 Thread rxin
Github user rxin commented on the pull request:

https://github.com/apache/spark/pull/7583#issuecomment-124352237
  
cc @yhuai 





[GitHub] spark pull request: [SPARK-9294][SQL] cleanup comments, code style...

2015-07-24 Thread asfgit
Github user asfgit closed the pull request at:

https://github.com/apache/spark/pull/7619





[GitHub] spark pull request: [SPARK-9069] [SQL] follow up

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7634#issuecomment-124354054
  
Merged build started.





[GitHub] spark pull request: [WIP][SPARK-8977][Streaming] Defines the RateE...

2015-07-24 Thread tdas
Github user tdas commented on a diff in the pull request:

https://github.com/apache/spark/pull/7600#discussion_r35399127
  
--- Diff: streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala ---
@@ -47,6 +49,14 @@ abstract class InputDStream[T: ClassTag] (@transient ssc_ : StreamingContext)
   /** This is a unique identifier for the input stream. */
   val id = ssc.getNewInputStreamId()
 
+  // Keep track of the freshest rate for this stream using the rateEstimator
+  protected[streaming] val rateController: Option[RateController] =
+    RateEstimator.makeEstimator(ssc.conf).map { estimator =>
--- End diff --

This should change a little. See the comment below (near `RateEstimator.makeEstimator`) on what the config params should be.





[GitHub] spark pull request: [SPARK-9170][SQL] Instead of StandardStructObj...

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7520#issuecomment-124360378
  
Merged build started.





[GitHub] spark pull request: [SPARK-9170][SQL] Instead of StandardStructObj...

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7520#issuecomment-124360202
  
 Merged build triggered.





[GitHub] spark pull request: [SPARK-9209] Using executor allocation, a exec...

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7559#issuecomment-124371932
  
 Merged build triggered.





[GitHub] spark pull request: [SPARK-9209] Using executor allocation, a exec...

2015-07-24 Thread SparkQA
Github user SparkQA commented on the pull request:

https://github.com/apache/spark/pull/7559#issuecomment-124372643
  
  [Test build #38329 has 
started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38329/consoleFull)
 for   PR 7559 at commit 
[`0e973e6`](https://github.com/apache/spark/commit/0e973e6186a121ee567c3d555939ddb860376871).





[GitHub] spark pull request: [SPARK-9247] [SQL] Use BytesToBytesMap for bro...

2015-07-24 Thread rxin
Github user rxin commented on a diff in the pull request:

https://github.com/apache/spark/pull/7592#discussion_r35399934
  
--- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala ---
@@ -149,31 +158,141 @@ private[joins] object HashedRelation {
   }
 }
 
+/**
+ * An extended CompactBuffer that can grow and update.
+ */
+private[joins] class MutableCompactBuffer[T: ClassTag] extends CompactBuffer[T] {
+  override def growToSize(newSize: Int): Unit = super.growToSize(newSize)
+  override def update(i: Int, v: T): Unit = super.update(i, v)
+}
 
 /**
  * A HashedRelation for UnsafeRow, which is backed by BytesToBytesMap that maps the key into a
  * sequence of values.
- *
- * TODO(davies): use BytesToBytesMap
  */
 private[joins] final class UnsafeHashedRelation(
--- End diff --

you need to update the scaladoc to document the encoding here





[GitHub] spark pull request: [SPARK-8867][SQL][WIP] Support list / describe...

2015-07-24 Thread SparkQA
Github user SparkQA commented on the pull request:

https://github.com/apache/spark/pull/7259#issuecomment-124376266
  
  [Test build #38314 has 
finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38314/console)
 for   PR 7259 at commit 
[`cf29bba`](https://github.com/apache/spark/commit/cf29bbadc24f8e86f9745692f617efab4eb19f4f).
 * This patch **fails Spark unit tests**.
 * This patch merges cleanly.
 * This patch adds the following public classes _(experimental)_:
  * `public class ExpressionInfo `
  * `case class ShowFunctions(db: Option[String], pattern: Option[String]) extends RunnableCommand `
  * `case class DescribeFunction(`






[GitHub] spark pull request: [SPARK-8867][SQL][WIP] Support list / describe...

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7259#issuecomment-124376389
  
Merged build finished. Test FAILed.





[GitHub] spark pull request: [SPARK-9225] [MLlib] LDASuite needs unit tests...

2015-07-24 Thread yu-iskw
Github user yu-iskw commented on the pull request:

https://github.com/apache/spark/pull/7620#issuecomment-124390015
  
@rotationsymmetry Great work! We could also implement these two tests with a for-loop over the optimizers instead, since they are almost identical and differ only in the optimizer used.

@feynmanliang What do you think?
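
A sketch of the suggested refactor (hypothetical test names; ScalaTest `FunSuite` assumed), registering one test per optimizer in a loop:

    import org.scalatest.FunSuite

    class LDASuiteSketch extends FunSuite {
      // The two near-identical tests collapse into one body parameterized
      // by the optimizer; only the setOptimizer argument would differ.
      for (optimizer <- Seq("em", "online")) {
        test(s"LDA with $optimizer optimizer") {
          // ... shared setup and assertions, using setOptimizer(optimizer) ...
          assert(Seq("em", "online").contains(optimizer))
        }
      }
    }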





[GitHub] spark pull request: [SPARK-9303] Decimal should use java.math.Deci...

2015-07-24 Thread SparkQA
Github user SparkQA commented on the pull request:

https://github.com/apache/spark/pull/7635#issuecomment-124390023
  
  [Test build #38319 has 
finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38319/console)
 for   PR 7635 at commit 
[`d7a3535`](https://github.com/apache/spark/commit/d7a35358e2068eca9bdead2b93f3b96dcaf890d8).
 * This patch **fails Spark unit tests**.
 * This patch merges cleanly.
 * This patch adds the following public classes _(experimental)_:
  * `case class SequenceNumberRange(`
  * `case class SequenceNumberRanges(ranges: Array[SequenceNumberRange]) `
  * `class KinesisBackedBlockRDDPartition(`
  * `class KinesisBackedBlockRDD(`
  * `class KinesisSequenceRangeIterator(`






[GitHub] spark pull request: [SPARK-9303] Decimal should use java.math.Deci...

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7635#issuecomment-124390048
  
Merged build finished. Test FAILed.





[GitHub] spark pull request: [SPARK-9247] [SQL] Use BytesToBytesMap for bro...

2015-07-24 Thread davies
Github user davies commented on a diff in the pull request:

https://github.com/apache/spark/pull/7592#discussion_r35401288
  
--- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala ---
@@ -149,31 +158,141 @@ private[joins] object HashedRelation {
   }
 }
 
+/**
+ * An extended CompactBuffer that can grow and update.
+ */
+private[joins] class MutableCompactBuffer[T: ClassTag] extends CompactBuffer[T] {
+  override def growToSize(newSize: Int): Unit = super.growToSize(newSize)
+  override def update(i: Int, v: T): Unit = super.update(i, v)
+}
 
 /**
  * A HashedRelation for UnsafeRow, which is backed by BytesToBytesMap that maps the key into a
  * sequence of values.
- *
- * TODO(davies): use BytesToBytesMap
  */
 private[joins] final class UnsafeHashedRelation(
     private var hashTable: JavaHashMap[UnsafeRow, CompactBuffer[UnsafeRow]])
   extends HashedRelation with Externalizable {
 
-  def this() = this(null)  // Needed for serialization
+  private[joins] def this() = this(null)  // Needed for serialization
+
+  // Use BytesToBytesMap in executor for better performance (it's created during deserialization)
+  @transient private[this] var binaryMap: BytesToBytesMap = _
+
+  // A pool of compact buffers to reduce memory garbage
+  @transient private[this] val bufferPool = new ThreadLocal[MutableCompactBuffer[UnsafeRow]]
 
-  override def get(key: InternalRow): CompactBuffer[InternalRow] = {
+  override def get(key: InternalRow): Seq[InternalRow] = {
     val unsafeKey = key.asInstanceOf[UnsafeRow]
-    // Thanks to type eraser
-    hashTable.get(unsafeKey).asInstanceOf[CompactBuffer[InternalRow]]
+
+    if (binaryMap != null) {
+      // Used in Broadcast join
+      val loc = binaryMap.lookup(unsafeKey.getBaseObject, unsafeKey.getBaseOffset,
+        unsafeKey.getSizeInBytes)
+      if (loc.isDefined) {
+        // thread-local buffer
+        var buffer = bufferPool.get()
+        if (buffer == null) {
+          buffer = new MutableCompactBuffer[UnsafeRow]
+          bufferPool.set(buffer)
+        }
+
+        val base = loc.getValueAddress.getBaseObject
+        var offset = loc.getValueAddress.getBaseOffset
+        val last = loc.getValueAddress.getBaseOffset + loc.getValueLength
+        var i = 0
+        while (offset < last) {
+          val numFields = PlatformDependent.UNSAFE.getInt(base, offset)
+          val sizeInBytes = PlatformDependent.UNSAFE.getInt(base, offset + 4)
+          offset += 8
+
+          // try to re-use the UnsafeRow in buffer, to reduce garbage
+          buffer.growToSize(i + 1)
+          if (buffer(i) == null) {
+            buffer(i) = new UnsafeRow
+          }
+          buffer(i).pointTo(base, offset, numFields, sizeInBytes, null)
+          i += 1
+          offset += sizeInBytes
+        }
+        buffer
+      } else {
+        null
+      }
+
+    } else {
+      // Use the JavaHashMap in Local mode or ShuffleHashJoin
+      hashTable.get(unsafeKey)
+    }
   }
 
   override def writeExternal(out: ObjectOutput): Unit = {
-    writeBytes(out, SparkSqlSerializer.serialize(hashTable))
+    out.writeInt(hashTable.size())
+
+    val iter = hashTable.entrySet().iterator()
+    while (iter.hasNext) {
+      val entry = iter.next()
+      val key = entry.getKey
+      val values = entry.getValue
+
+      // write all the values as a single byte array
+      var totalSize = 0L
+      var i = 0
+      while (i < values.size) {
+        totalSize += values(i).getSizeInBytes + 4 + 4
+        i += 1
+      }
+      assert(totalSize < Integer.MAX_VALUE, "values are too big")
+
+      // [key size] [values size] [key bytes] [values bytes]
+      out.writeInt(key.getSizeInBytes)
+      out.writeInt(totalSize.toInt)
+      out.write(key.getBytes)
+      i = 0
+      while (i < values.size) {
+        // [num of fields] [num of bytes] [row bytes]
+        // write the integer in native order, so they can be read by UNSAFE.getInt()
+        if (ByteOrder.nativeOrder() == ByteOrder.BIG_ENDIAN) {
+          out.writeInt(values(i).length())
+          out.writeInt(values(i).getSizeInBytes)
+        } else {
+          out.writeInt(Integer.reverseBytes(values(i).length()))
+          out.writeInt(Integer.reverseBytes(values(i).getSizeInBytes))
+        }
+        out.write(values(i).getBytes)
+        i += 1
+      }
+    }
   }
 
   override def readExternal(in: ObjectInput): Unit = {
-    hashTable = SparkSqlSerializer.deserialize(readBytes(in))
+val 
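
The `writeExternal` above writes lengths in native byte order so the executor can read them back with `UNSAFE.getInt`; a minimal standalone sketch of that trick (the helper name is illustrative only):

    // DataOutput.writeInt is always big-endian, but Unsafe.getInt reads in
    // native order; on little-endian hardware the bytes must be flipped first.
    import java.io.DataOutput
    import java.nio.ByteOrder

    def writeNativeOrderInt(out: DataOutput, v: Int): Unit = {
      if (ByteOrder.nativeOrder() == ByteOrder.BIG_ENDIAN) {
        out.writeInt(v)                        // already matches native order
      } else {
        out.writeInt(Integer.reverseBytes(v))  // flip for little-endian readers
      }
    }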

[GitHub] spark pull request: [SPARK-9247] [SQL] Use BytesToBytesMap for bro...

2015-07-24 Thread davies
Github user davies commented on a diff in the pull request:

https://github.com/apache/spark/pull/7592#discussion_r35401411
  
--- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala ---
@@ -65,18 +72,16 @@ trait HashJoin {
   {
 new Iterator[InternalRow] {
   private[this] var currentStreamedRow: InternalRow = _
-  private[this] var currentHashMatches: CompactBuffer[InternalRow] = _
+  private[this] var currentHashMatches: Seq[InternalRow] = _
--- End diff --

cc @JoshRosen 





[GitHub] spark pull request: [SPARK-9304] [BUILD] Improve backwards compati...

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7639#issuecomment-124394692
  
Merged build started.





[GitHub] spark pull request: [SPARK-9247] [SQL] Use BytesToBytesMap for bro...

2015-07-24 Thread rxin
Github user rxin commented on a diff in the pull request:

https://github.com/apache/spark/pull/7592#discussion_r35401495
  
--- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala ---
@@ -149,31 +158,141 @@ private[joins] object HashedRelation {
   }
 }
 
+/**
+ * An extended CompactBuffer that can grow and update.
+ */
+private[joins] class MutableCompactBuffer[T: ClassTag] extends CompactBuffer[T] {
+  override def growToSize(newSize: Int): Unit = super.growToSize(newSize)
+  override def update(i: Int, v: T): Unit = super.update(i, v)
+}
 
 /**
  * A HashedRelation for UnsafeRow, which is backed by BytesToBytesMap that maps the key into a
  * sequence of values.
- *
- * TODO(davies): use BytesToBytesMap
  */
 private[joins] final class UnsafeHashedRelation(
     private var hashTable: JavaHashMap[UnsafeRow, CompactBuffer[UnsafeRow]])
   extends HashedRelation with Externalizable {
 
-  def this() = this(null)  // Needed for serialization
+  private[joins] def this() = this(null)  // Needed for serialization
+
+  // Use BytesToBytesMap in executor for better performance (it's created during deserialization)
+  @transient private[this] var binaryMap: BytesToBytesMap = _
+
+  // A pool of compact buffers to reduce memory garbage
+  @transient private[this] val bufferPool = new ThreadLocal[MutableCompactBuffer[UnsafeRow]]
 
-  override def get(key: InternalRow): CompactBuffer[InternalRow] = {
+  override def get(key: InternalRow): Seq[InternalRow] = {
     val unsafeKey = key.asInstanceOf[UnsafeRow]
-    // Thanks to type eraser
-    hashTable.get(unsafeKey).asInstanceOf[CompactBuffer[InternalRow]]
+
+    if (binaryMap != null) {
+      // Used in Broadcast join
+      val loc = binaryMap.lookup(unsafeKey.getBaseObject, unsafeKey.getBaseOffset,
+        unsafeKey.getSizeInBytes)
+      if (loc.isDefined) {
+        // thread-local buffer
+        var buffer = bufferPool.get()
+        if (buffer == null) {
+          buffer = new MutableCompactBuffer[UnsafeRow]
+          bufferPool.set(buffer)
+        }
+
+        val base = loc.getValueAddress.getBaseObject
+        var offset = loc.getValueAddress.getBaseOffset
+        val last = loc.getValueAddress.getBaseOffset + loc.getValueLength
+        var i = 0
+        while (offset < last) {
+          val numFields = PlatformDependent.UNSAFE.getInt(base, offset)
+          val sizeInBytes = PlatformDependent.UNSAFE.getInt(base, offset + 4)
+          offset += 8
+
+          // try to re-use the UnsafeRow in buffer, to reduce garbage
+          buffer.growToSize(i + 1)
+          if (buffer(i) == null) {
+            buffer(i) = new UnsafeRow
+          }
+          buffer(i).pointTo(base, offset, numFields, sizeInBytes, null)
+          i += 1
+          offset += sizeInBytes
+        }
+        buffer
+      } else {
+        null
+      }
+
+    } else {
+      // Use the JavaHashMap in Local mode or ShuffleHashJoin
+      hashTable.get(unsafeKey)
+    }
   }
 
   override def writeExternal(out: ObjectOutput): Unit = {
-    writeBytes(out, SparkSqlSerializer.serialize(hashTable))
+    out.writeInt(hashTable.size())
+
+    val iter = hashTable.entrySet().iterator()
+    while (iter.hasNext) {
+      val entry = iter.next()
+      val key = entry.getKey
+      val values = entry.getValue
+
+      // write all the values as a single byte array
+      var totalSize = 0L
+      var i = 0
+      while (i < values.size) {
+        totalSize += values(i).getSizeInBytes + 4 + 4
+        i += 1
+      }
+      assert(totalSize < Integer.MAX_VALUE, "values are too big")
+
+      // [key size] [values size] [key bytes] [values bytes]
+      out.writeInt(key.getSizeInBytes)
+      out.writeInt(totalSize.toInt)
+      out.write(key.getBytes)
+      i = 0
+      while (i < values.size) {
+        // [num of fields] [num of bytes] [row bytes]
+        // write the integer in native order, so they can be read by UNSAFE.getInt()
+        if (ByteOrder.nativeOrder() == ByteOrder.BIG_ENDIAN) {
+          out.writeInt(values(i).length())
+          out.writeInt(values(i).getSizeInBytes)
+        } else {
+          out.writeInt(Integer.reverseBytes(values(i).length()))
+          out.writeInt(Integer.reverseBytes(values(i).getSizeInBytes))
+        }
+        out.write(values(i).getBytes)
+        i += 1
+      }
+    }
   }
 
   override def readExternal(in: ObjectInput): Unit = {
-    hashTable = SparkSqlSerializer.deserialize(readBytes(in))
+val 
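
The `get()` path above reuses one `MutableCompactBuffer` per thread; the same pooling pattern, reduced to a plain `ArrayBuffer` sketch (names are illustrative only):

    import scala.collection.mutable.ArrayBuffer

    // One buffer per thread: allocated on first use, then reused on every
    // call, so hot lookup paths produce no per-call garbage.
    val bufferPool = new ThreadLocal[ArrayBuffer[String]]

    def borrowBuffer(): ArrayBuffer[String] = {
      var buf = bufferPool.get()
      if (buf == null) {
        buf = new ArrayBuffer[String]
        bufferPool.set(buf)
      }
      buf.clear()
      buf
    }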

[GitHub] spark pull request: [SPARK-8930][SQL] Support a star '*' in genera...

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7305#issuecomment-124400159
  
Merged build finished. Test FAILed.





[GitHub] spark pull request: [SPARK-8930][SQL] Support a star '*' in genera...

2015-07-24 Thread SparkQA
Github user SparkQA commented on the pull request:

https://github.com/apache/spark/pull/7305#issuecomment-124399958
  
  [Test build #87 has 
finished](https://amplab.cs.berkeley.edu/jenkins/job/SlowSparkPullRequestBuilder/87/console)
 for   PR 7305 at commit 
[`de3423f`](https://github.com/apache/spark/commit/de3423fa4327690f63bf5e6ab8bbee6955cc0da9).
 * This patch **fails Spark unit tests**.
 * This patch merges cleanly.
 * This patch adds no public classes.





[GitHub] spark pull request: [WIP][MLLIB][SPARK-4675][SPARK-4823]RowSimilar...

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/6213#issuecomment-124399886
  
Build started.





[GitHub] spark pull request: [WIP][MLLIB][SPARK-4675][SPARK-4823]RowSimilar...

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/6213#issuecomment-124399778
  
 Build triggered.





[GitHub] spark pull request: [SPARK-9230] [ML] Support StringType features ...

2015-07-24 Thread ericl
Github user ericl commented on a diff in the pull request:

https://github.com/apache/spark/pull/7574#discussion_r35397462
  
--- Diff: mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala ---
@@ -130,9 +173,52 @@ class RFormula(override val uid: String)
       "Label column already exists and is not of type DoubleType.")
   }
 
-  private def hasLabelCol(schema: StructType): Boolean = {
-    schema.map(_.name).contains($(labelCol))
+  private def featureTransformer(schema: StructType): Transformer = {
+    // StringType terms and terms representing interactions need to be encoded before assembly.
+    // TODO(ekl) add support for feature interactions
+    var encoderStages = Seq[Transformer]()
+    var tempColumns = Seq[String]()
+    val encodedTerms = parsedFormula.terms.map { term =>
+      schema(term) match {
+        case column if column.dataType == StringType =>
+          val encodedTerm = term + "_onehot_" + uid
+          val indexer = factorLevels(term)
+          val indexCol = indexer.getOrDefault(indexer.outputCol)
+          encoderStages :+= indexer
+          encoderStages :+= new OneHotEncoder()
+            .setInputCol(indexCol)
+            .setOutputCol(encodedTerm)
+          tempColumns :+= encodedTerm
+          tempColumns :+= indexCol
+          encodedTerm
+        case _ =>
+          term
+      }
+    }
+    encoderStages :+= new VectorAssembler(uid)
+      .setInputCols(encodedTerms.toArray)
+      .setOutputCol($(featuresCol))
+    encoderStages :+= new ColumnPruner(tempColumns.toSet)
+    new PipelineModel(uid, encoderStages.toArray)
+  }
+}
+
+/**
+ * Utility transformer for removing temporary columns from a DataFrame.
+ */
+private class ColumnPruner(columnsToPrune: Set[String]) extends Transformer {
--- End diff --

Done





[GitHub] spark pull request: [SPARK-9230] [ML] Support StringType features ...

2015-07-24 Thread ericl
Github user ericl commented on a diff in the pull request:

https://github.com/apache/spark/pull/7574#discussion_r35397464
  
--- Diff: mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala ---
@@ -130,9 +173,52 @@ class RFormula(override val uid: String)
       "Label column already exists and is not of type DoubleType.")
   }
 
-  private def hasLabelCol(schema: StructType): Boolean = {
-    schema.map(_.name).contains($(labelCol))
+  private def featureTransformer(schema: StructType): Transformer = {
+    // StringType terms and terms representing interactions need to be encoded before assembly.
+    // TODO(ekl) add support for feature interactions
+    var encoderStages = Seq[Transformer]()
+    var tempColumns = Seq[String]()
+    val encodedTerms = parsedFormula.terms.map { term =>
+      schema(term) match {
+        case column if column.dataType == StringType =>
+          val encodedTerm = term + "_onehot_" + uid
+          val indexer = factorLevels(term)
+          val indexCol = indexer.getOrDefault(indexer.outputCol)
+          encoderStages :+= indexer
+          encoderStages :+= new OneHotEncoder()
+            .setInputCol(indexCol)
+            .setOutputCol(encodedTerm)
+          tempColumns :+= encodedTerm
+          tempColumns :+= indexCol
+          encodedTerm
+        case _ =>
+          term
+      }
+    }
+    encoderStages :+= new VectorAssembler(uid)
+      .setInputCols(encodedTerms.toArray)
+      .setOutputCol($(featuresCol))
+    encoderStages :+= new ColumnPruner(tempColumns.toSet)
+    new PipelineModel(uid, encoderStages.toArray)
+  }
+}
+
+/**
+ * Utility transformer for removing temporary columns from a DataFrame.
+ */
+private class ColumnPruner(columnsToPrune: Set[String]) extends Transformer {
+  override val uid = Identifiable.randomUID("columnPruner")
+  override def transform(dataset: DataFrame): DataFrame = {
+    var res: DataFrame = dataset
+    for (column <- columnsToPrune) {
+      res = res.drop(column)
--- End diff --

Done





[GitHub] spark pull request: [SPARK-9230] [ML] Support StringType features ...

2015-07-24 Thread ericl
Github user ericl commented on a diff in the pull request:

https://github.com/apache/spark/pull/7574#discussion_r35397461
  
--- Diff: mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala ---
@@ -62,19 +77,60 @@ class RFormula(override val uid: String)
   /** @group getParam */
   def getFormula: String = $(formula)
 
-  /** @group getParam */
-  def setFeaturesCol(value: String): this.type = set(featuresCol, value)
+  override def fit(dataset: DataFrame): RFormulaModel = {
+    require(parsedFormula.isDefined, "Must call setFormula() first.")
+    val factorLevels = parsedFormula.get.terms.flatMap { term =>
+      dataset.schema(term) match {
+        case column if column.dataType == StringType =>
+          val idxTerm = term + "_idx_" + uid
+          val indexer = new StringIndexer().setInputCol(term).setOutputCol(idxTerm)
+          Some(term -> indexer.fit(dataset))
+        case _ =>
+          None
--- End diff --

Done
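
Putting the pieces from these diffs together, a hedged usage sketch of the `RFormula` stage under review (column and dataset names are hypothetical): string factors get indexed by `StringIndexer` during `fit`, then one-hot encoded and assembled into the features vector.

    import org.apache.spark.ml.feature.RFormula

    // "country" is a StringType column, so fit() trains a StringIndexer for
    // it; transform() one-hot encodes it and assembles the features vector.
    val formula = new RFormula()
      .setFormula("clicked ~ country + hour")
      .setFeaturesCol("features")
      .setLabelCol("label")

    val model = formula.fit(dataset)      // dataset: an existing DataFrame
    val output = model.transform(dataset) // adds "features" and "label" columns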





[GitHub] spark pull request: [SPARK-9303] Decimal should use java.math.Deci...

2015-07-24 Thread JoshRosen
GitHub user JoshRosen opened a pull request:

https://github.com/apache/spark/pull/7635

[SPARK-9303] Decimal should use java.math.Decimal directly instead of using 
Scala wrapper

Spark SQL's `Decimal` class should use Java's BigDecimal instead of 
Scala's, since this removes a layer of object allocation and works around an 
issue where Scala's BigDecimal.hashCode() can lead to OOMs (see 
https://issues.scala-lang.org/browse/SI-6173).
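
To make the motivation concrete, a small sketch of the extra allocation layer (plain Scala, not from the patch):

    import java.math.{BigDecimal => JBigDecimal}

    val jbd = new JBigDecimal("123.45")   // the underlying Java value: one object
    val sbd = scala.math.BigDecimal(jbd)  // Scala's wrapper: a second allocation

    // The wrapper only delegates to the Java instance it holds; see SI-6173
    // for how its hashCode can blow up on values with huge exponents.
    assert(sbd.underlying() eq jbd)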

You can merge this pull request into a Git repository by running:

$ git pull https://github.com/JoshRosen/spark SPARK-9303

Alternatively you can review and apply these changes as the patch at:

https://github.com/apache/spark/pull/7635.patch

To close this pull request, make a commit to your master/trunk branch
with (at least) the following in the commit message:

This closes #7635


commit d7a35358e2068eca9bdead2b93f3b96dcaf890d8
Author: Josh Rosen joshro...@databricks.com
Date:   2015-07-24T06:02:13Z

[SPARK-9303] Decimal should use java.math.Decimal directly instead of via 
Scala wrapper







[GitHub] spark pull request: [SPARK-9303] Decimal should use java.math.Deci...

2015-07-24 Thread rxin
Github user rxin commented on a diff in the pull request:

https://github.com/apache/spark/pull/7635#discussion_r35397756
  
--- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala ---
@@ -135,19 +137,19 @@ final class Decimal extends Ordered[Decimal] with Serializable {
     this
   }
 
-  def toBigDecimal: BigDecimal = {
+  def toBigDecimal: BigDecimal = BigDecimal(toJavaBigDecimal)
--- End diff --

Decimal is not meant to be public. I think it just accidentally became 
public.






[GitHub] spark pull request: [SPARK-9200][SQL] Don't implicitly cast non-at...

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7636#issuecomment-124345665
  
Merged build started.





[GitHub] spark pull request: [SPARK-9200][SQL] Don't implicitly cast non-at...

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7636#issuecomment-124345653
  
 Merged build triggered.





[GitHub] spark pull request: [build] Enable memory leak detection for Tungs...

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7637#issuecomment-124345647
  
 Merged build triggered.





[GitHub] spark pull request: [build] Enable memory leak detection for Tungs...

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7637#issuecomment-124345667
  
Merged build started.





[GitHub] spark pull request: [SPARK-9069] [SQL] follow up

2015-07-24 Thread SparkQA
Github user SparkQA commented on the pull request:

https://github.com/apache/spark/pull/7634#issuecomment-124345657
  
  [Test build #38321 has 
finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38321/console)
 for   PR 7634 at commit 
[`6a91f32`](https://github.com/apache/spark/commit/6a91f32112c02a507319bc7d090b9256739181e4).
 * This patch **fails Spark unit tests**.
 * This patch merges cleanly.
 * This patch adds the following public classes _(experimental)_:
  * `case class ChangeDecimalPrecision(child: Expression) extends 
UnaryExpression `






[GitHub] spark pull request: [SPARK-9069] [SQL] follow up

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7634#issuecomment-124345679
  
Merged build finished. Test FAILed.





[GitHub] spark pull request: [SPARK-9230] [ML] Support StringType features ...

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7574#issuecomment-124346736
  
Merged build finished. Test PASSed.





[GitHub] spark pull request: [SPARK-9230] [ML] Support StringType features ...

2015-07-24 Thread SparkQA
Github user SparkQA commented on the pull request:

https://github.com/apache/spark/pull/7574#issuecomment-124346148
  
  [Test build #38316 has 
finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38316/console)
 for   PR 7574 at commit 
[`c302a2c`](https://github.com/apache/spark/commit/c302a2c40088de89feb37964f182de33279df818).
 * This patch **passes all tests**.
 * This patch merges cleanly.
 * This patch adds the following public classes _(experimental)_:
  * `class RFormula(override val uid: String) extends 
Estimator[RFormulaModel] with RFormulaBase `






[GitHub] spark pull request: [SPARK-9069] [SQL] follow up

2015-07-24 Thread SparkQA
Github user SparkQA commented on the pull request:

https://github.com/apache/spark/pull/7634#issuecomment-124354807
  
  [Test build #38326 has 
started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38326/consoleFull)
 for   PR 7634 at commit 
[`65b251c`](https://github.com/apache/spark/commit/65b251cf40fd9975d3c194e49ba27038f351069c).





[GitHub] spark pull request: [SPARK-9170][SQL] Instead of StandardStructObj...

2015-07-24 Thread viirya
Github user viirya commented on a diff in the pull request:

https://github.com/apache/spark/pull/7520#discussion_r35398948
  
--- Diff: 
sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala ---
@@ -86,19 +86,10 @@ private[orc] class OrcOutputWriter(
   TypeInfoUtils.getTypeInfoFromTypeString(
 HiveMetastoreTypes.toMetastoreType(dataSchema))
 
-TypeInfoUtils
-  .getStandardJavaObjectInspectorFromTypeInfo(typeInfo)
-  .asInstanceOf[StructObjectInspector]
+OrcStruct.createObjectInspector(typeInfo.asInstanceOf[StructTypeInfo])
+  .asInstanceOf[SettableStructObjectInspector]
--- End diff --

`OrcStruct.createObjectInspector` will return an `OrcStructInspector` for the 
STRUCT data type, which is a `SettableStructObjectInspector`. Because we pass 
a `StructTypeInfo` when calling `createObjectInspector`, I think it should be 
safe.





[GitHub] spark pull request: [SPARK-9069] [SQL] follow up

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7634#issuecomment-124353841
  
 Merged build triggered.





[GitHub] spark pull request: [SPARK-9247] [SQL] Use BytesToBytesMap for bro...

2015-07-24 Thread rxin
Github user rxin commented on a diff in the pull request:

https://github.com/apache/spark/pull/7592#discussion_r35399269
  
--- Diff: 
sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala
 ---
@@ -149,31 +158,141 @@ private[joins] object HashedRelation {
   }
 }
 
+/**
+ * An extended CompactBuffer that can grow and be updated.
+ */
+private[joins] class MutableCompactBuffer[T: ClassTag] extends 
CompactBuffer[T] {
+  override def growToSize(newSize: Int): Unit = super.growToSize(newSize)
+  override def update(i: Int, v: T): Unit = super.update(i, v)
+}
 
 /**
  * A HashedRelation for UnsafeRow, which is backed by BytesToBytesMap that 
maps the key into a
  * sequence of values.
- *
- * TODO(davies): use BytesToBytesMap
  */
 private[joins] final class UnsafeHashedRelation(
 private var hashTable: JavaHashMap[UnsafeRow, 
CompactBuffer[UnsafeRow]])
   extends HashedRelation with Externalizable {
 
-  def this() = this(null)  // Needed for serialization
+  private[joins] def this() = this(null)  // Needed for serialization
+
+  // Use BytesToBytesMap in executor for better performance (it's created 
during deserialization)
+  @transient private[this] var binaryMap: BytesToBytesMap = _
+
+  // A pool of compact buffers to reduce memory garbage
+  @transient private[this] val bufferPool = new 
ThreadLocal[MutableCompactBuffer[UnsafeRow]]
--- End diff --

buffer pool has a special meaning in databases. we should pick a different 
name.
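
For context, the idiom being named here is per-thread reuse of a mutable
scratch buffer to cut allocation garbage on the probe path. A minimal, generic
sketch (names are illustrative, not Spark's):

```scala
import java.util.ArrayList

// One reusable scratch buffer per thread; hot paths borrow it via
// withScratch instead of allocating a fresh buffer on every call.
object ThreadLocalScratch {
  private val scratch = new ThreadLocal[ArrayList[String]] {
    override def initialValue(): ArrayList[String] = new ArrayList[String]()
  }

  def withScratch[T](f: ArrayList[String] => T): T = {
    val buf = scratch.get()
    buf.clear() // reuse the same instance across calls on this thread
    f(buf)
  }
}
```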






[GitHub] spark pull request: [WIP][SPARK-8977][Streaming] Defines the RateE...

2015-07-24 Thread tdas
Github user tdas commented on a diff in the pull request:

https://github.com/apache/spark/pull/7600#discussion_r35399262
  
--- Diff: 
streaming/src/main/scala/org/apache/spark/streaming/scheduler/RateController.scala
 ---
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.streaming.scheduler
+
+import java.util.concurrent.atomic.AtomicLong
+
+import scala.concurrent.{ExecutionContext, Future}
+
+import org.apache.spark.streaming.scheduler.rate.RateEstimator
+import org.apache.spark.util.ThreadUtils
+
+/**
+ * A StreamingListener that receives batch completion updates, and 
maintains
+ * an estimate of the speed at which this stream should ingest messages,
+ * given an estimate computation from a `RateEstimator`
+ */
+private [streaming] abstract class RateController(val streamUID: Int, 
rateEstimator: RateEstimator)
+  extends StreamingListener with Serializable {
+
+  protected def publish(rate: Long): Unit
+
+  // Used to compute & publish the rate update asynchronously
+  @transient
+  implicit private val executionContext = 
ExecutionContext.fromExecutorService(
+ThreadUtils.newDaemonSingleThreadExecutor("stream-rate-update"))
+
+  private val rateLimit: AtomicLong = new AtomicLong(-1L)
+
+  /**
+   * Compute the new rate limit and publish it asynchronously.
+   */
+  private def computeAndPublish(time: Long, elems: Long, workDelay: Long, 
waitDelay: Long): Unit =
+Future[Unit] {
+  val newSpeed = rateEstimator.compute(time, elems, workDelay, 
waitDelay)
+  newSpeed foreach { s =>
--- End diff --

We don't generally use postfix notation. Use `newSpeed.foreach`.
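
A quick illustration of the style rule, with a stand-in value:

```scala
val newRate: Option[Double] = Some(42.0) // stand-in for the estimator's output

// Preferred (dot notation):
newRate.foreach { r => println(r) }

// Discouraged (postfix notation):
// newRate foreach { r => println(r) }
```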





[GitHub] spark pull request: [SPARK-9293] Analysis should check that set op...

2015-07-24 Thread cloud-fan
Github user cloud-fan commented on the pull request:

https://github.com/apache/spark/pull/7631#issuecomment-124362448
  
I tried it locally; Hive reports an error if we union two `select`s with 
different output lengths. But in our test we union two `InsertIntoTable` 
operations, which seems to be a special case for Hive (or maybe the output of 
`InsertIntoTable` is `Nil` in Hive?). 
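
A hypothetical repro of the behavior described above (table and column names
invented; `sqlContext` assumed to be a `HiveContext`):

```scala
// In Hive this fails analysis because the two sides of the union
// have different numbers of output columns.
sqlContext.sql(
  """SELECT a, b FROM t1
    |UNION ALL
    |SELECT c FROM t2""".stripMargin)
```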





[GitHub] spark pull request: [WIP][SPARK-8977][Streaming] Defines the RateE...

2015-07-24 Thread tdas
Github user tdas commented on a diff in the pull request:

https://github.com/apache/spark/pull/7600#discussion_r35399267
  
--- Diff: 
streaming/src/test/scala/org/apache/spark/streaming/scheduler/RateControllerSuite.scala
 ---
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.streaming.scheduler
+
+import scala.collection.mutable
+import scala.reflect.ClassTag
+import scala.util.control.NonFatal
+
+import org.scalatest.Matchers._
+import org.scalatest.concurrent.Eventually._
+import org.scalatest.time.SpanSugar._
+
+import org.apache.spark.streaming._
+import org.apache.spark.streaming.scheduler.rate.RateEstimator
+
+
+
--- End diff --

nit: extra line





[GitHub] spark pull request: [WIP][SPARK-8977][Streaming] Defines the RateE...

2015-07-24 Thread tdas
Github user tdas commented on a diff in the pull request:

https://github.com/apache/spark/pull/7600#discussion_r35399265
  
--- Diff: 
streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/RateEstimator.scala
 ---
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.streaming.scheduler.rate
+
+import org.apache.spark.SparkConf
+import org.apache.spark.SparkException
+
+/**
+ * A component that estimates the rate at which an InputDStream should 
ingest
+ * elements, based on updates at every batch completion.
+ */
+private[streaming] trait RateEstimator extends Serializable {
+
+  /**
+   * Computes the number of elements the stream attached to this 
`RateEstimator`
+   * should ingest per second, given an update on the size and completion
+   * times of the latest batch.
+   *
+   * @param time The timestamp of the current batch interval that just 
finished
+   * @param elements The number of elements that were processed in this 
batch
+   * @param processingDelay The time in ms it took for the job to 
complete
+   * @param schedulingDelay The time in ms that the job spent in the 
scheduling queue
+   */
+  def compute(
+  time: Long,
+  elements: Long,
+  processingDelay: Long,
+  schedulingDelay: Long): Option[Double]
+}
+
+object RateEstimator {
+
+  /**
+   * Return a new RateEstimator based on the value of 
`spark.streaming.RateEstimator`.
+   *
+   * @return None if there is no configured estimator, otherwise an 
instance of RateEstimator
+   * @throws IllegalArgumentException if there is a configured 
RateEstimator that doesn't match any
+   * known estimators.
+   */
+  def makeEstimator(conf: SparkConf): Option[RateEstimator] =
+conf.getOption("spark.streaming.RateEstimator") map { estimator =>
--- End diff --

Let's make the configuration params as follows:
`spark.streaming.backpressure.enable = true/false` to enable/disable the 
whole feature, false by default in 1.5. Also add it to docs/configuration.md.
`spark.streaming.backpressure.rateEstimator = pid` to specify which 
algorithm to use for estimating. DO NOT add this to docs/configuration.md.

Basically, the scope `spark.streaming.backpressure` will contain all 
backpressure-related configuration. 

Accordingly, the RateController in InputDStreams will be created as 
`RateController.create()`, and the estimator will be created by 
`RateEstimator.create`.
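
A sketch of the factory shape proposed here, reusing the `RateEstimator` trait
from the diff above; the `fixed` estimator and its
`spark.streaming.backpressure.fixedRate` parameter are invented for
illustration, not the PR's actual estimator:

```scala
import org.apache.spark.SparkConf

object RateEstimatorFactory {
  // Returns None when backpressure is disabled; otherwise builds the
  // configured estimator (here, a trivial constant-rate one).
  def create(conf: SparkConf): Option[RateEstimator] = {
    if (!conf.getBoolean("spark.streaming.backpressure.enable", false)) {
      None
    } else {
      conf.get("spark.streaming.backpressure.rateEstimator", "fixed") match {
        case "fixed" =>
          val rate = conf.getDouble("spark.streaming.backpressure.fixedRate", 1000.0)
          Some(new RateEstimator {
            // Ignore batch statistics and always suggest the same rate.
            override def compute(
                time: Long,
                elements: Long,
                processingDelay: Long,
                schedulingDelay: Long): Option[Double] = Some(rate)
          })
        case unknown =>
          throw new IllegalArgumentException(s"Unknown rate estimator: $unknown")
      }
    }
  }
}
```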





[GitHub] spark pull request: [WIP][SPARK-8977][Streaming] Defines the RateE...

2015-07-24 Thread tdas
Github user tdas commented on a diff in the pull request:

https://github.com/apache/spark/pull/7600#discussion_r35399259
  
--- Diff: 
streaming/src/main/scala/org/apache/spark/streaming/scheduler/RateController.scala
 ---
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.streaming.scheduler
+
+import java.util.concurrent.atomic.AtomicLong
+
+import scala.concurrent.{ExecutionContext, Future}
+
+import org.apache.spark.streaming.scheduler.rate.RateEstimator
+import org.apache.spark.util.ThreadUtils
+
+/**
+ * A StreamingListener that receives batch completion updates, and 
maintains
+ * an estimate of the speed at which this stream should ingest messages,
+ * given an estimate computation from a `RateEstimator`
+ */
+private [streaming] abstract class RateController(val streamUID: Int, 
rateEstimator: RateEstimator)
+  extends StreamingListener with Serializable {
+
+  protected def publish(rate: Long): Unit
+
+  // Used to compute & publish the rate update asynchronously
+  @transient
+  implicit private val executionContext = 
ExecutionContext.fromExecutorService(
+ThreadUtils.newDaemonSingleThreadExecutor("stream-rate-update"))
+
+  private val rateLimit: AtomicLong = new AtomicLong(-1L)
+
+  /**
+   * Compute the new rate limit and publish it asynchronously.
+   */
+  private def computeAndPublish(time: Long, elems: Long, workDelay: Long, 
waitDelay: Long): Unit =
+Future[Unit] {
+  val newSpeed = rateEstimator.compute(time, elems, workDelay, 
waitDelay)
--- End diff --

newSpeed --> newRate





[GitHub] spark pull request: [SPARK-9170][SQL] Instead of StandardStructObj...

2015-07-24 Thread SparkQA
Github user SparkQA commented on the pull request:

https://github.com/apache/spark/pull/7520#issuecomment-124365064
  
  [Test build #38327 has 
started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38327/consoleFull)
 for   PR 7520 at commit 
[`96796da`](https://github.com/apache/spark/commit/96796da0abaf695bcda52fbd395413bf6e9520de).





[GitHub] spark pull request: [WIP][SPARK-8977][Streaming] Defines the RateE...

2015-07-24 Thread tdas
Github user tdas commented on the pull request:

https://github.com/apache/spark/pull/7600#issuecomment-124363987
  
Time to remove the WIP from the title :) It's looking good. Will take 
another pass once these comments are addressed.





[GitHub] spark pull request: [SPARK-9209] Using executor allocation, a exec...

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7559#issuecomment-124372120
  
Merged build started.





[GitHub] spark pull request: [SPARK-9043] Serialize key, value and combiner...

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7403#issuecomment-124379596
  
Merged build finished. Test FAILed.





[GitHub] spark pull request: [SPARK-9043] Serialize key, value and combiner...

2015-07-24 Thread SparkQA
Github user SparkQA commented on the pull request:

https://github.com/apache/spark/pull/7403#issuecomment-124379407
  
  [Test build #38310 has 
finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38310/console)
 for   PR 7403 at commit 
[`d07b771`](https://github.com/apache/spark/commit/d07b771f787166257691d8c4e9bce2adfc189860).
 * This patch **fails Spark unit tests**.
 * This patch merges cleanly.
 * This patch adds the following public classes _(experimental)_:
  * `class ShuffleDependency[K: ClassTag, V: ClassTag, C: ClassTag](`
  * `class CoGroupedRDD[K: ClassTag](@transient var rdds: Seq[RDD[_ <: 
Product2[K, _]]],`
  * `class ShuffledRDD[K: ClassTag, V: ClassTag, C: ClassTag](`






[GitHub] spark pull request: [SPARK-9285][SQL] Remove InternalRow's inherit...

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7626#issuecomment-124390396
  
 Merged build triggered.





[GitHub] spark pull request: [SPARK-9285][SQL] Remove InternalRow's inherit...

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7626#issuecomment-124390411
  
Merged build started.





[GitHub] spark pull request: [SPARK-5155] [PySpark] [Streaming] Mqtt stream...

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/4229#issuecomment-124390582
  
Merged build finished. Test FAILed.





[GitHub] spark pull request: [SPARK-5155] [PySpark] [Streaming] Mqtt stream...

2015-07-24 Thread SparkQA
Github user SparkQA commented on the pull request:

https://github.com/apache/spark/pull/4229#issuecomment-124390541
  
  [Test build #86 has 
finished](https://amplab.cs.berkeley.edu/jenkins/job/SlowSparkPullRequestBuilder/86/console)
 for   PR 4229 at commit 
[`87fc677`](https://github.com/apache/spark/commit/87fc6771586aaf025c2810b5fa3a160a9773c4d2).
 * This patch **fails Spark unit tests**.
 * This patch merges cleanly.
 * This patch adds the following public classes _(experimental)_:
  * `class MQTTUtils(object):`






[GitHub] spark pull request: [SPARK-9247] [SQL] Use BytesToBytesMap for bro...

2015-07-24 Thread rxin
Github user rxin commented on the pull request:

https://github.com/apache/spark/pull/7592#issuecomment-124398910
  
@davies this is ok, but what I had in mind was pretty different: a version of 
the join operator that could take more advantage of what unsafe provides. 
There are still quite a few conversions involved right now. HashedRelation is 
also limiting you, due to the shared variable across multiple threads.

I think we will need to create a new join operator soon anyway, in order to 
avoid the join row conversion.






[GitHub] spark pull request: [SPARK-4352][YARN][WIP] Incorporate locality p...

2015-07-24 Thread jerryshao
Github user jerryshao commented on a diff in the pull request:

https://github.com/apache/spark/pull/6394#discussion_r35401677
  
--- Diff: 
core/src/main/scala/org/apache/spark/ExecutorAllocationClient.scala ---
@@ -28,7 +28,10 @@ private[spark] trait ExecutorAllocationClient {
* This can result in canceling pending requests or filing additional 
requests.
* @return whether the request is acknowledged by the cluster manager.
*/
-  private[spark] def requestTotalExecutors(numExecutors: Int): Boolean
+  private[spark] def requestTotalExecutors(
+  numExecutors: Int,
--- End diff --

Seems I missed this comment; I will update the code.





[GitHub] spark pull request: [SPARK-9069] [SQL] follow up

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7634#issuecomment-124342521
  
Merged build started.





[GitHub] spark pull request: [SPARK-9069] [SQL] follow up

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7634#issuecomment-124342480
  
 Merged build triggered.





[GitHub] spark pull request: [SPARK-5373][SQL] use transformDown when rewri...

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7583#issuecomment-124341890
  
Merged build finished. Test PASSed.





[GitHub] spark pull request: [SPARK-5373][SQL] use transformDown when rewri...

2015-07-24 Thread SparkQA
Github user SparkQA commented on the pull request:

https://github.com/apache/spark/pull/7583#issuecomment-124341684
  
  [Test build #38307 has 
finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38307/console)
 for   PR 7583 at commit 
[`9d59628`](https://github.com/apache/spark/commit/9d59628bce695a2825c74e5f5d5c2fa6ab40c734).
 * This patch **passes all tests**.
 * This patch merges cleanly.
 * This patch adds no public classes.





[GitHub] spark pull request: [SPARK-9303] Decimal should use java.math.Deci...

2015-07-24 Thread davies
Github user davies commented on a diff in the pull request:

https://github.com/apache/spark/pull/7635#discussion_r35397974
  
--- Diff: 
sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala ---
@@ -135,19 +137,19 @@ final class Decimal extends Ordered[Decimal] with 
Serializable {
 this
   }
 
-  def toBigDecimal: BigDecimal = {
+  def toBigDecimal: BigDecimal = BigDecimal(toJavaBigDecimal)
--- End diff --

I think user's code should never touch Decimal





[GitHub] spark pull request: [SPARK-8951][SparkR] support Unicode character...

2015-07-24 Thread SparkQA
Github user SparkQA commented on the pull request:

https://github.com/apache/spark/pull/7494#issuecomment-124343702
  
  [Test build #38304 has 
finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38304/console)
 for   PR 7494 at commit 
[`bc469d8`](https://github.com/apache/spark/commit/bc469d8ae91024675abb37b76a1354c99206c5c2).
 * This patch **passes all tests**.
 * This patch merges cleanly.
 * This patch adds no public classes.





[GitHub] spark pull request: [SPARK-8951][SparkR] support Unicode character...

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7494#issuecomment-124343874
  
Merged build finished. Test PASSed.





[GitHub] spark pull request: [SPARK-9069] [SQL] follow up

2015-07-24 Thread SparkQA
Github user SparkQA commented on the pull request:

https://github.com/apache/spark/pull/7634#issuecomment-124342964
  
  [Test build #38321 has 
started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38321/consoleFull)
 for   PR 7634 at commit 
[`6a91f32`](https://github.com/apache/spark/commit/6a91f32112c02a507319bc7d090b9256739181e4).





[GitHub] spark pull request: [SPARK-8930][SQL] Support a star '*' in genera...

2015-07-24 Thread SparkQA
Github user SparkQA commented on the pull request:

https://github.com/apache/spark/pull/7305#issuecomment-124345120
  
  [Test build #87 has 
started](https://amplab.cs.berkeley.edu/jenkins/job/SlowSparkPullRequestBuilder/87/consoleFull)
 for   PR 7305 at commit 
[`de3423f`](https://github.com/apache/spark/commit/de3423fa4327690f63bf5e6ab8bbee6955cc0da9).





[GitHub] spark pull request: [SPARK-9200][SQL] Don't implicitly cast non-at...

2015-07-24 Thread rxin
GitHub user rxin opened a pull request:

https://github.com/apache/spark/pull/7636

[SPARK-9200][SQL] Don't implicitly cast non-atomic types to string type.
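
A hedged, hypothetical illustration of the class of query this change affects:
previously a non-atomic (complex) value could be implicitly cast to string
when compared against a string literal; after this change such a comparison
should fail analysis instead (table and column names invented):

```scala
// Hypothetical: assumes a table t with an array<int> column arr.
// With the implicit cast removed, this no longer silently compares
// the array as a string.
sqlContext.sql("SELECT * FROM t WHERE arr = 'foo'")
```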



You can merge this pull request into a Git repository by running:

$ git pull https://github.com/rxin/spark complex-string-implicit-cast

Alternatively you can review and apply these changes as the patch at:

https://github.com/apache/spark/pull/7636.patch

To close this pull request, make a commit to your master/trunk branch
with (at least) the following in the commit message:

This closes #7636


commit 3e67327e72ac4fe6f9e1ade4fe56d140fa2303d1
Author: Reynold Xin r...@databricks.com
Date:   2015-07-24T06:28:01Z

[SPARK-9200][SQL] Don't implicitly cast non-atomic types to string type.







[GitHub] spark pull request: [SPARK-8930][SQL] Support a star '*' in genera...

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7305#issuecomment-124345056
  
Merged build started.





[GitHub] spark pull request: [SPARK-9200][SQL] Don't implicitly cast non-at...

2015-07-24 Thread SparkQA
Github user SparkQA commented on the pull request:

https://github.com/apache/spark/pull/7636#issuecomment-124345792
  
  [Test build #38325 has 
started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38325/consoleFull)
 for   PR 7636 at commit 
[`3e67327`](https://github.com/apache/spark/commit/3e67327e72ac4fe6f9e1ade4fe56d140fa2303d1).





[GitHub] spark pull request: [build] Enable memory leak detection for Tungs...

2015-07-24 Thread SparkQA
Github user SparkQA commented on the pull request:

https://github.com/apache/spark/pull/7637#issuecomment-124345771
  
  [Test build #38324 has 
started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38324/consoleFull)
 for   PR 7637 at commit 
[`34bc3ef`](https://github.com/apache/spark/commit/34bc3efa60480f3af73cec4f85132e1afc2c5c7c).





[GitHub] spark pull request: [SPARK-9303] Decimal should use java.math.Deci...

2015-07-24 Thread rxin
Github user rxin commented on a diff in the pull request:

https://github.com/apache/spark/pull/7635#discussion_r35398709
  
--- Diff: 
sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala ---
@@ -135,19 +137,19 @@ final class Decimal extends Ordered[Decimal] with 
Serializable {
 this
   }
 
-  def toBigDecimal: BigDecimal = {
+  def toBigDecimal: BigDecimal = BigDecimal(toJavaBigDecimal)
--- End diff --

We should remove it, but it is ok to leave it in for now I think. Most 
likely this entire class will look very different soon...






[GitHub] spark pull request: [SPARK-9152][SQL] Implement code generation fo...

2015-07-24 Thread viirya
Github user viirya commented on the pull request:

https://github.com/apache/spark/pull/7561#issuecomment-124356779
  
ping @davies @rxin 





[GitHub] spark pull request: [SPARK-9170][SQL] Instead of StandardStructObj...

2015-07-24 Thread viirya
Github user viirya commented on a diff in the pull request:

https://github.com/apache/spark/pull/7520#discussion_r35399063
  
--- Diff: 
sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala ---
@@ -120,15 +111,11 @@ private[orc] class OrcOutputWriter(
   }
 
   override def write(row: Row): Unit = {
-var i = 0
-while (i < row.length) {
-  reusableOutputBuffer(i) = wrappers(i)(row(i))
-  i += 1
-}
+val orcRow = wrap(row, structOI)
--- End diff --

Agreed. It will be a concern. I will update this part to reuse an 
`OrcStruct`.





[GitHub] spark pull request: [SPARK-9295] Analysis should detect sorting on...

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7633#issuecomment-124365696
  
Merged build started.





[GitHub] spark pull request: [SPARK-9065][Streaming][PySpark] Add MessageHa...

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7410#issuecomment-124365477
  
Merged build finished. Test FAILed.





[GitHub] spark pull request: [SPARK-9295] Analysis should detect sorting on...

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7633#issuecomment-124365550
  
 Merged build triggered.





[GitHub] spark pull request: [SPARK-9065][Streaming][PySpark] Add MessageHa...

2015-07-24 Thread SparkQA
Github user SparkQA commented on the pull request:

https://github.com/apache/spark/pull/7410#issuecomment-124365161
  
  [Test build #38309 has 
finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38309/console)
 for   PR 7410 at commit 
[`a09423e`](https://github.com/apache/spark/commit/a09423ed6ce4ab28d8e52e4ba0a8d80d1ae098a1).
 * This patch **fails Spark unit tests**.
 * This patch merges cleanly.
 * This patch adds the following public classes _(experimental)_:
  * `  case class PythonMessageAndMetadata(`
  * `  class PythonMessageAndMetadataPickler extends IObjectPickler `
  * `class KafkaMessageAndMetadata(object):`
  * `case class ChangePrecision(child: Expression) extends 
UnaryExpression `
  * `abstract class AlgebraicAggregate extends AggregateFunction2 with 
Serializable with Unevaluable `
  * `abstract class AggregateFunction1 extends LeafExpression with 
Serializable `
  * `case class DecimalType(precision: Int, scale: Int) extends 
FractionalType `
  * `  case class  DecimalConversion(precision: Int, scale: Int) extends 
JDBCConversion`



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] spark pull request: [SPARK-9305] Rename org.apache.spark.Row to It...

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7638#issuecomment-124383061
  
Merged build started.





[GitHub] spark pull request: [SPARK-9305] Rename org.apache.spark.Row to It...

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/7638#issuecomment-124383030
  
 Merged build triggered.





[GitHub] spark pull request: [SPARK-9305] Rename org.apache.spark.Row to It...

2015-07-24 Thread rxin
GitHub user rxin opened a pull request:

https://github.com/apache/spark/pull/7638

[SPARK-9305] Rename org.apache.spark.Row to Item.

It's a thing used in test cases, but named Row. Pretty annoying, because 
every time I search for Row, it shows up before the Spark SQL Row, which is 
what a developer wants most of the time.


You can merge this pull request into a Git repository by running:

$ git pull https://github.com/rxin/spark remove-row

Alternatively you can review and apply these changes as the patch at:

https://github.com/apache/spark/pull/7638.patch

To close this pull request, make a commit to your master/trunk branch
with (at least) the following in the commit message:

This closes #7638


commit aeda52d4635aaf850355d91313b437161117288b
Author: Reynold Xin r...@databricks.com
Date:   2015-07-24T07:15:30Z

[SPARK-9305] Rename org.apache.spark.Row to Item.







[GitHub] spark pull request: [SPARK-9305] Rename org.apache.spark.Row to It...

2015-07-24 Thread SparkQA
Github user SparkQA commented on the pull request:

https://github.com/apache/spark/pull/7638#issuecomment-124385556
  
  [Test build #38330 has 
started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38330/consoleFull)
 for   PR 7638 at commit 
[`aeda52d`](https://github.com/apache/spark/commit/aeda52d4635aaf850355d91313b437161117288b).





[GitHub] spark pull request: [SPARK-8564][Streaming]Add the Python API for ...

2015-07-24 Thread SparkQA
Github user SparkQA commented on the pull request:

https://github.com/apache/spark/pull/6955#issuecomment-124389877
  
  [Test build #38312 has 
finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38312/console)
 for   PR 6955 at commit 
[`6c37395`](https://github.com/apache/spark/commit/6c37395b2a4d0d56aa4a8d9ec76899c6066b29ab).
 * This patch **fails Spark unit tests**.
 * This patch merges cleanly.
 * This patch adds the following public classes _(experimental)_:
  * `class KinesisUtils(object):`
  * `class InitialPositionInStream(object):`






[GitHub] spark pull request: [SPARK-8564][Streaming]Add the Python API for ...

2015-07-24 Thread AmplabJenkins
Github user AmplabJenkins commented on the pull request:

https://github.com/apache/spark/pull/6955#issuecomment-124389905
  
Merged build finished. Test FAILed.





[GitHub] spark pull request: [SPARK-9247] [SQL] Use BytesToBytesMap for bro...

2015-07-24 Thread rxin
Github user rxin commented on a diff in the pull request:

https://github.com/apache/spark/pull/7592#discussion_r35400866
  
--- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala ---
@@ -149,31 +158,141 @@ private[joins] object HashedRelation {
   }
 }

+/**
+ * An extended CompactBuffer that can grow and be updated in place.
+ */
+private[joins] class MutableCompactBuffer[T: ClassTag] extends CompactBuffer[T] {
+  override def growToSize(newSize: Int): Unit = super.growToSize(newSize)
+  override def update(i: Int, v: T): Unit = super.update(i, v)
+}

 /**
  * A HashedRelation for UnsafeRow, which is backed by BytesToBytesMap that maps the key into a
  * sequence of values.
- *
- * TODO(davies): use BytesToBytesMap
  */
 private[joins] final class UnsafeHashedRelation(
     private var hashTable: JavaHashMap[UnsafeRow, CompactBuffer[UnsafeRow]])
   extends HashedRelation with Externalizable {

-  def this() = this(null)  // Needed for serialization
+  private[joins] def this() = this(null)  // Needed for serialization
+
+  // Use BytesToBytesMap on executors for better performance (it's created during deserialization)
+  @transient private[this] var binaryMap: BytesToBytesMap = _
+
+  // A pool of compact buffers to reduce memory garbage
+  @transient private[this] val bufferPool = new ThreadLocal[MutableCompactBuffer[UnsafeRow]]

-  override def get(key: InternalRow): CompactBuffer[InternalRow] = {
+  override def get(key: InternalRow): Seq[InternalRow] = {
     val unsafeKey = key.asInstanceOf[UnsafeRow]
-    // Thanks to type erasure
-    hashTable.get(unsafeKey).asInstanceOf[CompactBuffer[InternalRow]]
+
+    if (binaryMap != null) {
+      // Used in broadcast join
+      val loc = binaryMap.lookup(unsafeKey.getBaseObject, unsafeKey.getBaseOffset,
+        unsafeKey.getSizeInBytes)
+      if (loc.isDefined) {
+        // thread-local buffer
+        var buffer = bufferPool.get()
+        if (buffer == null) {
+          buffer = new MutableCompactBuffer[UnsafeRow]
+          bufferPool.set(buffer)
+        }
+
+        val base = loc.getValueAddress.getBaseObject
+        var offset = loc.getValueAddress.getBaseOffset
+        val last = loc.getValueAddress.getBaseOffset + loc.getValueLength
+        var i = 0
+        while (offset < last) {
+          val numFields = PlatformDependent.UNSAFE.getInt(base, offset)
+          val sizeInBytes = PlatformDependent.UNSAFE.getInt(base, offset + 4)
+          offset += 8
+
+          // try to re-use the UnsafeRow in the buffer, to reduce garbage
+          buffer.growToSize(i + 1)
+          if (buffer(i) == null) {
+            buffer(i) = new UnsafeRow
+          }
+          buffer(i).pointTo(base, offset, numFields, sizeInBytes, null)
+          i += 1
+          offset += sizeInBytes
+        }
+        buffer
+      } else {
+        null
+      }
+
+    } else {
+      // Use the JavaHashMap in local mode or ShuffleHashJoin
+      hashTable.get(unsafeKey)
+    }
   }

   override def writeExternal(out: ObjectOutput): Unit = {
-    writeBytes(out, SparkSqlSerializer.serialize(hashTable))
+    out.writeInt(hashTable.size())
+
+    val iter = hashTable.entrySet().iterator()
+    while (iter.hasNext) {
+      val entry = iter.next()
+      val key = entry.getKey
+      val values = entry.getValue
+
+      // write all the values as a single byte array
+      var totalSize = 0L
+      var i = 0
+      while (i < values.size) {
+        totalSize += values(i).getSizeInBytes + 4 + 4
+        i += 1
+      }
+      assert(totalSize < Integer.MAX_VALUE, "values are too big")
+
+      // [key size] [values size] [key bytes] [values bytes]
+      out.writeInt(key.getSizeInBytes)
+      out.writeInt(totalSize.toInt)
+      out.write(key.getBytes)
+      i = 0
+      while (i < values.size) {
+        // [num of fields] [num of bytes] [row bytes]
+        // write the integers in native byte order, so they can be read by UNSAFE.getInt()
+        if (ByteOrder.nativeOrder() == ByteOrder.BIG_ENDIAN) {
+          out.writeInt(values(i).length())
+          out.writeInt(values(i).getSizeInBytes)
+        } else {
+          out.writeInt(Integer.reverseBytes(values(i).length()))
+          out.writeInt(Integer.reverseBytes(values(i).getSizeInBytes))
+        }
+        out.write(values(i).getBytes)
+        i += 1
+      }
+    }
   }

   override def readExternal(in: ObjectInput): Unit = {
-    hashTable = SparkSqlSerializer.deserialize(readBytes(in))
+    val 
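
The serialized layout under review here frames each value as `[num of fields] [num of bytes] [row bytes]`, with the two int headers written in native byte order so that deserialization can read them back directly with `UNSAFE.getInt()`. A minimal, self-contained sketch of that framing and byte-order trick follows; `FakeRow`, `encode`, and `decode` are hypothetical names for illustration (not Spark APIs), and a native-order `ByteBuffer` stands in for the Unsafe reads:

```scala
import java.io.{ByteArrayOutputStream, DataOutputStream}
import java.nio.{ByteBuffer, ByteOrder}

// A stand-in for an UnsafeRow payload: a field count plus raw row bytes.
case class FakeRow(numFields: Int, bytes: Seq[Byte])

object NativeOrderLayoutDemo {

  // Frame each record as [num of fields][num of bytes][row bytes], headers in
  // native byte order. DataOutputStream always writes big-endian, so on
  // little-endian hosts we pre-swap with Integer.reverseBytes, as in the diff.
  def encode(rows: Seq[FakeRow]): Array[Byte] = {
    val bos = new ByteArrayOutputStream()
    val out = new DataOutputStream(bos)
    val bigEndian = ByteOrder.nativeOrder() == ByteOrder.BIG_ENDIAN
    rows.foreach { row =>
      val (n, s) = (row.numFields, row.bytes.length)
      if (bigEndian) { out.writeInt(n); out.writeInt(s) }
      else { out.writeInt(Integer.reverseBytes(n)); out.writeInt(Integer.reverseBytes(s)) }
      out.write(row.bytes.toArray)
    }
    out.flush()
    bos.toByteArray
  }

  // Read it back with a native-order ByteBuffer (standing in for UNSAFE.getInt).
  def decode(data: Array[Byte]): Seq[FakeRow] = {
    val buf = ByteBuffer.wrap(data).order(ByteOrder.nativeOrder())
    val rows = Seq.newBuilder[FakeRow]
    while (buf.hasRemaining) {
      val n = buf.getInt()
      val s = buf.getInt()
      val bytes = new Array[Byte](s)
      buf.get(bytes)
      rows += FakeRow(n, bytes.toSeq)
    }
    rows.result()
  }

  def main(args: Array[String]): Unit = {
    val rows = Seq(FakeRow(2, Seq[Byte](1, 2, 3)), FakeRow(1, Seq[Byte](9)))
    assert(decode(encode(rows)) == rows)
    println("round trip ok")
  }
}
```

The pre-swap with `Integer.reverseBytes` is needed only because `DataOutputStream.writeInt` always emits big-endian; swapping on little-endian writers makes the on-disk bytes match what a native-order read expects.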

[GitHub] spark pull request: [SPARK-9247] [SQL] Use BytesToBytesMap for bro...

2015-07-24 Thread rxin
Github user rxin commented on a diff in the pull request:

https://github.com/apache/spark/pull/7592#discussion_r35400801
  
--- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala ---
@@ -149,31 +158,141 @@ private[joins] object HashedRelation {
   }
 }

+/**
+ * An extended CompactBuffer that can grow and be updated in place.
+ */
+private[joins] class MutableCompactBuffer[T: ClassTag] extends CompactBuffer[T] {
+  override def growToSize(newSize: Int): Unit = super.growToSize(newSize)
+  override def update(i: Int, v: T): Unit = super.update(i, v)
+}

 /**
  * A HashedRelation for UnsafeRow, which is backed by BytesToBytesMap that maps the key into a
  * sequence of values.
- *
- * TODO(davies): use BytesToBytesMap
  */
 private[joins] final class UnsafeHashedRelation(
     private var hashTable: JavaHashMap[UnsafeRow, CompactBuffer[UnsafeRow]])
   extends HashedRelation with Externalizable {

-  def this() = this(null)  // Needed for serialization
+  private[joins] def this() = this(null)  // Needed for serialization
+
+  // Use BytesToBytesMap on executors for better performance (it's created during deserialization)
+  @transient private[this] var binaryMap: BytesToBytesMap = _
+
+  // A pool of compact buffers to reduce memory garbage
+  @transient private[this] val bufferPool = new ThreadLocal[MutableCompactBuffer[UnsafeRow]]

-  override def get(key: InternalRow): CompactBuffer[InternalRow] = {
+  override def get(key: InternalRow): Seq[InternalRow] = {
     val unsafeKey = key.asInstanceOf[UnsafeRow]
-    // Thanks to type erasure
-    hashTable.get(unsafeKey).asInstanceOf[CompactBuffer[InternalRow]]
+
+    if (binaryMap != null) {
+      // Used in broadcast join
+      val loc = binaryMap.lookup(unsafeKey.getBaseObject, unsafeKey.getBaseOffset,
+        unsafeKey.getSizeInBytes)
+      if (loc.isDefined) {
+        // thread-local buffer
+        var buffer = bufferPool.get()
+        if (buffer == null) {
+          buffer = new MutableCompactBuffer[UnsafeRow]
+          bufferPool.set(buffer)
+        }
+
+        val base = loc.getValueAddress.getBaseObject
+        var offset = loc.getValueAddress.getBaseOffset
+        val last = loc.getValueAddress.getBaseOffset + loc.getValueLength
+        var i = 0
+        while (offset < last) {
+          val numFields = PlatformDependent.UNSAFE.getInt(base, offset)
+          val sizeInBytes = PlatformDependent.UNSAFE.getInt(base, offset + 4)
+          offset += 8
+
+          // try to re-use the UnsafeRow in the buffer, to reduce garbage
+          buffer.growToSize(i + 1)
+          if (buffer(i) == null) {
+            buffer(i) = new UnsafeRow
+          }
+          buffer(i).pointTo(base, offset, numFields, sizeInBytes, null)
+          i += 1
+          offset += sizeInBytes
+        }
+        buffer
+      } else {
+        null
+      }
+
+    } else {
+      // Use the JavaHashMap in local mode or ShuffleHashJoin
+      hashTable.get(unsafeKey)
+    }
   }

   override def writeExternal(out: ObjectOutput): Unit = {
-    writeBytes(out, SparkSqlSerializer.serialize(hashTable))
+    out.writeInt(hashTable.size())
+
+    val iter = hashTable.entrySet().iterator()
+    while (iter.hasNext) {
+      val entry = iter.next()
+      val key = entry.getKey
+      val values = entry.getValue
+
+      // write all the values as a single byte array
+      var totalSize = 0L
+      var i = 0
+      while (i < values.size) {
+        totalSize += values(i).getSizeInBytes + 4 + 4
+        i += 1
+      }
+      assert(totalSize < Integer.MAX_VALUE, "values are too big")
+
+      // [key size] [values size] [key bytes] [values bytes]
+      out.writeInt(key.getSizeInBytes)
+      out.writeInt(totalSize.toInt)
+      out.write(key.getBytes)
+      i = 0
+      while (i < values.size) {
+        // [num of fields] [num of bytes] [row bytes]
+        // write the integers in native byte order, so they can be read by UNSAFE.getInt()
+        if (ByteOrder.nativeOrder() == ByteOrder.BIG_ENDIAN) {
+          out.writeInt(values(i).length())
+          out.writeInt(values(i).getSizeInBytes)
+        } else {
+          out.writeInt(Integer.reverseBytes(values(i).length()))
+          out.writeInt(Integer.reverseBytes(values(i).getSizeInBytes))
+        }
+        out.write(values(i).getBytes)
+        i += 1
+      }
+    }
   }

   override def readExternal(in: ObjectInput): Unit = {
-    hashTable = SparkSqlSerializer.deserialize(readBytes(in))
+    val 
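
The `bufferPool` field in the quoted diff is a per-thread object pool: every lookup on a given thread reuses one `MutableCompactBuffer` and the `UnsafeRow`s inside it, so hot broadcast-join probes allocate almost nothing, at the cost that a returned buffer is only valid until that thread's next `get()`. A minimal sketch of that `ThreadLocal` pooling pattern, using the hypothetical names `ScratchPool` and `RowView` rather than real Spark classes:

```scala
import scala.collection.mutable.ArrayBuffer

// A reusable, thread-local view object (hypothetical, not Spark's UnsafeRow).
final class RowView {
  var offset: Long = 0L
  var length: Int = 0
}

object ScratchPool {
  // One buffer per thread: concurrent readers never contend, and the returned
  // views are only valid until that thread's next borrow (the same caveat as
  // the pooled MutableCompactBuffer above).
  private val pool = new ThreadLocal[ArrayBuffer[RowView]] {
    override def initialValue(): ArrayBuffer[RowView] = new ArrayBuffer[RowView]
  }

  // Grow the pool once, then hand back the same RowView objects on every call.
  def borrow(n: Int): ArrayBuffer[RowView] = {
    val buf = pool.get()
    while (buf.length < n) buf += new RowView
    buf
  }
}

object PoolDemo {
  def main(args: Array[String]): Unit = {
    val views = ScratchPool.borrow(3)
    views(0).offset = 16L
    views(0).length = 8
    // Borrowing again on the same thread returns the very same objects.
    assert(ScratchPool.borrow(1)(0) eq views(0))
    println(s"pooled ${views.length} views")
  }
}
```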
