[GitHub] spark pull request: [SPARK-9303] Decimal should use java.math.Deci...
Github user JoshRosen commented on the pull request: https://github.com/apache/spark/pull/7635#issuecomment-124339199 /cc @rxin @davies --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9069] [SPARK-9264] [SQL] remove unlimit...
Github user davies commented on a diff in the pull request: https://github.com/apache/spark/pull/7605#discussion_r35397614 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala --- @@ -334,144 +334,94 @@ object HiveTypeCoercion { * - SHORT gets turned into DECIMAL(5, 0) * - INT gets turned into DECIMAL(10, 0) * - LONG gets turned into DECIMAL(20, 0) - * - FLOAT and DOUBLE - * 1. Union, Intersect and Except operations: - * FLOAT gets turned into DECIMAL(7, 7), DOUBLE gets turned into DECIMAL(15, 15) (this is the - * same as Hive) - * 2. Other operation: - * FLOAT and DOUBLE cause fixed-length decimals to turn into DOUBLE (this is the same as Hive, - * but note that unlimited decimals are considered bigger than doubles in WidenTypes) + * - FLOAT and DOUBLE cause fixed-length decimals to turn into DOUBLE + * + * Note: Union/Except/Interact is handled by WidenTypes */ // scalastyle:on object DecimalPrecision extends Rule[LogicalPlan] { import scala.math.{max, min} -// Conversion rules for integer types into fixed-precision decimals -private val intTypeToFixed: Map[DataType, DecimalType] = Map( - ByteType - DecimalType(3, 0), - ShortType - DecimalType(5, 0), - IntegerType - DecimalType(10, 0), - LongType - DecimalType(20, 0) -) - private def isFloat(t: DataType): Boolean = t == FloatType || t == DoubleType -// Conversion rules for float and double into fixed-precision decimals -private val floatTypeToFixed: Map[DataType, DecimalType] = Map( - FloatType - DecimalType(7, 7), - DoubleType - DecimalType(15, 15) -) - -private def castDecimalPrecision( -left: LogicalPlan, -right: LogicalPlan): (LogicalPlan, LogicalPlan) = { - val castedInput = left.output.zip(right.output).map { -case (lhs, rhs) if lhs.dataType != rhs.dataType = - (lhs.dataType, rhs.dataType) match { -case (DecimalType.Fixed(p1, s1), DecimalType.Fixed(p2, s2)) = - // Decimals with precision/scale p1/s2 and p2/s2 will be promoted to - // DecimalType(max(s1, s2) + 
max(p1-s1, p2-s2), max(s1, s2)) - val fixedType = DecimalType(max(s1, s2) + max(p1 - s1, p2 - s2), max(s1, s2)) - (Alias(Cast(lhs, fixedType), lhs.name)(), Alias(Cast(rhs, fixedType), rhs.name)()) -case (t, DecimalType.Fixed(p, s)) if intTypeToFixed.contains(t) = - (Alias(Cast(lhs, intTypeToFixed(t)), lhs.name)(), rhs) -case (DecimalType.Fixed(p, s), t) if intTypeToFixed.contains(t) = - (lhs, Alias(Cast(rhs, intTypeToFixed(t)), rhs.name)()) -case (t, DecimalType.Fixed(p, s)) if floatTypeToFixed.contains(t) = - (Alias(Cast(lhs, floatTypeToFixed(t)), lhs.name)(), rhs) -case (DecimalType.Fixed(p, s), t) if floatTypeToFixed.contains(t) = - (lhs, Alias(Cast(rhs, floatTypeToFixed(t)), rhs.name)()) -case _ = (lhs, rhs) - } -case other = other - } - - val (castedLeft, castedRight) = castedInput.unzip +// Returns the wider decimal type that's wider than both of them +def widerDecimalType(d1: DecimalType, d2: DecimalType): DecimalType = { + widerDecimalType(d1.precision, d1.scale, d2.precision, d2.scale) +} +// max(s1, s2) + max(p1-s1, p2-s2), max(s1, s2) +def widerDecimalType(p1: Int, s1: Int, p2: Int, s2: Int): DecimalType = { + val scale = max(s1, s2) + val range = max(p1 - s1, p2 - s2) + DecimalType.bounded(range + scale, scale) +} - val newLeft = -if (castedLeft.map(_.dataType) != left.output.map(_.dataType)) { - Project(castedLeft, left) -} else { - left -} +/** + * An expression used to wrap the children when promote the precision of DecimalType to avoid + * promote multiple times. 
+ */ +case class ChangePrecision(child: Expression) extends UnaryExpression { + override def dataType: DataType = child.dataType + override def eval(input: InternalRow): Any = child.eval(input) + override def gen(ctx: CodeGenContext): GeneratedExpressionCode = child.gen(ctx) + override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = "" + override def prettyName: String = "change_precision" +} - val newRight = -if (castedRight.map(_.dataType) != right.output.map(_.dataType)) { - Project(castedRight, right) -}
[GitHub] spark pull request: [SPARK-9069] [SPARK-9264] [SQL] remove unlimit...
Github user rxin commented on a diff in the pull request: https://github.com/apache/spark/pull/7605#discussion_r35397548 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala --- @@ -334,144 +334,94 @@ object HiveTypeCoercion { * - SHORT gets turned into DECIMAL(5, 0) * - INT gets turned into DECIMAL(10, 0) * - LONG gets turned into DECIMAL(20, 0) - * - FLOAT and DOUBLE - * 1. Union, Intersect and Except operations: - * FLOAT gets turned into DECIMAL(7, 7), DOUBLE gets turned into DECIMAL(15, 15) (this is the - * same as Hive) - * 2. Other operation: - * FLOAT and DOUBLE cause fixed-length decimals to turn into DOUBLE (this is the same as Hive, - * but note that unlimited decimals are considered bigger than doubles in WidenTypes) + * - FLOAT and DOUBLE cause fixed-length decimals to turn into DOUBLE + * + * Note: Union/Except/Interact is handled by WidenTypes */ // scalastyle:on object DecimalPrecision extends Rule[LogicalPlan] { import scala.math.{max, min} -// Conversion rules for integer types into fixed-precision decimals -private val intTypeToFixed: Map[DataType, DecimalType] = Map( - ByteType - DecimalType(3, 0), - ShortType - DecimalType(5, 0), - IntegerType - DecimalType(10, 0), - LongType - DecimalType(20, 0) -) - private def isFloat(t: DataType): Boolean = t == FloatType || t == DoubleType -// Conversion rules for float and double into fixed-precision decimals -private val floatTypeToFixed: Map[DataType, DecimalType] = Map( - FloatType - DecimalType(7, 7), - DoubleType - DecimalType(15, 15) -) - -private def castDecimalPrecision( -left: LogicalPlan, -right: LogicalPlan): (LogicalPlan, LogicalPlan) = { - val castedInput = left.output.zip(right.output).map { -case (lhs, rhs) if lhs.dataType != rhs.dataType = - (lhs.dataType, rhs.dataType) match { -case (DecimalType.Fixed(p1, s1), DecimalType.Fixed(p2, s2)) = - // Decimals with precision/scale p1/s2 and p2/s2 will be promoted to - // DecimalType(max(s1, s2) + 
max(p1-s1, p2-s2), max(s1, s2)) - val fixedType = DecimalType(max(s1, s2) + max(p1 - s1, p2 - s2), max(s1, s2)) - (Alias(Cast(lhs, fixedType), lhs.name)(), Alias(Cast(rhs, fixedType), rhs.name)()) -case (t, DecimalType.Fixed(p, s)) if intTypeToFixed.contains(t) = - (Alias(Cast(lhs, intTypeToFixed(t)), lhs.name)(), rhs) -case (DecimalType.Fixed(p, s), t) if intTypeToFixed.contains(t) = - (lhs, Alias(Cast(rhs, intTypeToFixed(t)), rhs.name)()) -case (t, DecimalType.Fixed(p, s)) if floatTypeToFixed.contains(t) = - (Alias(Cast(lhs, floatTypeToFixed(t)), lhs.name)(), rhs) -case (DecimalType.Fixed(p, s), t) if floatTypeToFixed.contains(t) = - (lhs, Alias(Cast(rhs, floatTypeToFixed(t)), rhs.name)()) -case _ = (lhs, rhs) - } -case other = other - } - - val (castedLeft, castedRight) = castedInput.unzip +// Returns the wider decimal type that's wider than both of them +def widerDecimalType(d1: DecimalType, d2: DecimalType): DecimalType = { + widerDecimalType(d1.precision, d1.scale, d2.precision, d2.scale) +} +// max(s1, s2) + max(p1-s1, p2-s2), max(s1, s2) +def widerDecimalType(p1: Int, s1: Int, p2: Int, s2: Int): DecimalType = { + val scale = max(s1, s2) + val range = max(p1 - s1, p2 - s2) + DecimalType.bounded(range + scale, scale) +} - val newLeft = -if (castedLeft.map(_.dataType) != left.output.map(_.dataType)) { - Project(castedLeft, left) -} else { - left -} +/** + * An expression used to wrap the children when promote the precision of DecimalType to avoid + * promote multiple times. 
+ */ +case class ChangePrecision(child: Expression) extends UnaryExpression { + override def dataType: DataType = child.dataType + override def eval(input: InternalRow): Any = child.eval(input) + override def gen(ctx: CodeGenContext): GeneratedExpressionCode = child.gen(ctx) + override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = "" + override def prettyName: String = "change_precision" +} - val newRight = -if (castedRight.map(_.dataType) != right.output.map(_.dataType)) { - Project(castedRight, right) -}
[GitHub] spark pull request: [SPARK-9303] Decimal should use java.math.Deci...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7635#issuecomment-124339249 Merged build triggered. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-5373][SQL] use transformDown when rewri...
Github user SparkQA commented on the pull request: https://github.com/apache/spark/pull/7583#issuecomment-124339269 [Test build #38305 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38305/console) for PR 7583 at commit [`d6326d8`](https://github.com/apache/spark/commit/d6326d8d260f6ec125248e0eb25026b554248034). * This patch **passes all tests**. * This patch merges cleanly. * This patch adds no public classes. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9303] Decimal should use java.math.Deci...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7635#issuecomment-124339270 Merged build started. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9069] [SQL] follow up
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7634#issuecomment-124339271 Merged build started. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9069] [SQL] follow up
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7634#issuecomment-124339252 Merged build triggered. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-5373][SQL] use transformDown when rewri...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7583#issuecomment-124339334 Merged build finished. Test PASSed. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9222] [MLlib] Make class instantiation ...
Github user SparkQA commented on the pull request: https://github.com/apache/spark/pull/7573#issuecomment-124342620 [Test build #38313 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38313/console) for PR 7573 at commit [`2f1a293`](https://github.com/apache/spark/commit/2f1a29312f2c5ee74ba416243a8724707b034071). * This patch **passes all tests**. * This patch merges cleanly. * This patch adds no public classes. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9303] Decimal should use java.math.Deci...
Github user JoshRosen commented on a diff in the pull request: https://github.com/apache/spark/pull/7635#discussion_r35398027 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala --- @@ -135,19 +137,19 @@ final class Decimal extends Ordered[Decimal] with Serializable { this } - def toBigDecimal: BigDecimal = { + def toBigDecimal: BigDecimal = BigDecimal(toJavaBigDecimal) --- End diff -- If we're in agreement that it should be removed, then I'm glad to update this patch to do so (which involves adding a MiMa exclusion, I think). We could also just leave it and decide later. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9222] [MLlib] Make class instantiation ...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7573#issuecomment-124342664 Merged build finished. Test PASSed. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-8951][SparkR] support Unicode character...
Github user CHOIJAEHONG1 commented on the pull request: https://github.com/apache/spark/pull/7494#issuecomment-124344827 good job, jenkins. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-8930][SQL] Support a star '*' in genera...
Github user SparkQA commented on the pull request: https://github.com/apache/spark/pull/7305#issuecomment-124344817 [Test build #38323 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38323/consoleFull) for PR 7305 at commit [`de3423f`](https://github.com/apache/spark/commit/de3423fa4327690f63bf5e6ab8bbee6955cc0da9). --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-8930][SQL] Support a star '*' in genera...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7305#issuecomment-124345035 Merged build triggered. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [build] Enable memory leak detection for Tungs...
Github user rxin commented on the pull request: https://github.com/apache/spark/pull/7637#issuecomment-124345445 cc @JoshRosen --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [build] Enable memory leak detection for Tungs...
GitHub user rxin opened a pull request: https://github.com/apache/spark/pull/7637 [build] Enable memory leak detection for Tungsten. This was turned off accidentally in #7591. You can merge this pull request into a Git repository by running: $ git pull https://github.com/rxin/spark enable-mem-leak-detect Alternatively you can review and apply these changes as the patch at: https://github.com/apache/spark/pull/7637.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #7637 commit 34bc3efa60480f3af73cec4f85132e1afc2c5c7c Author: Reynold Xin r...@databricks.com Date: 2015-07-24T06:29:27Z Enable memory leak detection for Tungsten. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-5373][SQL] use transformDown when rewri...
Github user rxin commented on the pull request: https://github.com/apache/spark/pull/7583#issuecomment-124352237 cc @yhuai --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9294][SQL] cleanup comments, code style...
Github user asfgit closed the pull request at: https://github.com/apache/spark/pull/7619 --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9069] [SQL] follow up
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7634#issuecomment-124354054 Merged build started. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [WIP][SPARK-8977][Streaming] Defines the RateE...
Github user tdas commented on a diff in the pull request: https://github.com/apache/spark/pull/7600#discussion_r35399127 --- Diff: streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala --- @@ -47,6 +49,14 @@ abstract class InputDStream[T: ClassTag] (@transient ssc_ : StreamingContext) /** This is an unique identifier for the input stream. */ val id = ssc.getNewInputStreamId() + // Keep track of the freshest rate for this stream using the rateEstimator + protected[streaming] val rateController: Option[RateController] = +RateEstimator.makeEstimator(ssc.conf).map { estimator = --- End diff -- This should change a little. See comment on what the config params should be in another comment below (near `RateEstimator.makeEstimator`) --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9170][SQL] Instead of StandardStructObj...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7520#issuecomment-124360378 Merged build started. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9170][SQL] Instead of StandardStructObj...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7520#issuecomment-124360202 Merged build triggered. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9209] Using executor allocation, a exec...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7559#issuecomment-124371932 Merged build triggered. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9209] Using executor allocation, a exec...
Github user SparkQA commented on the pull request: https://github.com/apache/spark/pull/7559#issuecomment-124372643 [Test build #38329 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38329/consoleFull) for PR 7559 at commit [`0e973e6`](https://github.com/apache/spark/commit/0e973e6186a121ee567c3d555939ddb860376871). --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9247] [SQL] Use BytesToBytesMap for bro...
Github user rxin commented on a diff in the pull request: https://github.com/apache/spark/pull/7592#discussion_r35399934 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala --- @@ -149,31 +158,141 @@ private[joins] object HashedRelation { } } +/** + * An extended CompactBuffer that could grow and update. + */ +private[joins] class MutableCompactBuffer[T: ClassTag] extends CompactBuffer[T] { + override def growToSize(newSize: Int): Unit = super.growToSize(newSize) + override def update(i: Int, v: T): Unit = super.update(i, v) +} /** * A HashedRelation for UnsafeRow, which is backed by BytesToBytesMap that maps the key into a * sequence of values. - * - * TODO(davies): use BytesToBytesMap */ private[joins] final class UnsafeHashedRelation( --- End diff -- you need to update the scaladoc to document the encoding here --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-8867][SQL][WIP] Support list / describe...
Github user SparkQA commented on the pull request: https://github.com/apache/spark/pull/7259#issuecomment-124376266 [Test build #38314 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38314/console) for PR 7259 at commit [`cf29bba`](https://github.com/apache/spark/commit/cf29bbadc24f8e86f9745692f617efab4eb19f4f). * This patch **fails Spark unit tests**. * This patch merges cleanly. * This patch adds the following public classes _(experimental)_: * `public class ExpressionInfo ` * `case class ShowFunctions(db: Option[String], pattern: Option[String]) extends RunnableCommand ` * `case class DescribeFunction(` --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-8867][SQL][WIP] Support list / describe...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7259#issuecomment-124376389 Merged build finished. Test FAILed. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9225] [MLlib] LDASuite needs unit tests...
Github user yu-iskw commented on the pull request: https://github.com/apache/spark/pull/7620#issuecomment-124390015 @rotationsymmetry Great work! We could also implement those two tests in one test suite with for-loop of the optimizers instead. Since both of them are almost same and there is a difference about the optimizers. @feynmanliang What do you think? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9303] Decimal should use java.math.Deci...
Github user SparkQA commented on the pull request: https://github.com/apache/spark/pull/7635#issuecomment-124390023 [Test build #38319 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38319/console) for PR 7635 at commit [`d7a3535`](https://github.com/apache/spark/commit/d7a35358e2068eca9bdead2b93f3b96dcaf890d8). * This patch **fails Spark unit tests**. * This patch merges cleanly. * This patch adds the following public classes _(experimental)_: * `case class SequenceNumberRange(` * `case class SequenceNumberRanges(ranges: Array[SequenceNumberRange]) ` * `class KinesisBackedBlockRDDPartition(` * `class KinesisBackedBlockRDD(` * `class KinesisSequenceRangeIterator(` --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9303] Decimal should use java.math.Deci...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7635#issuecomment-124390048 Merged build finished. Test FAILed. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9247] [SQL] Use BytesToBytesMap for bro...
Github user davies commented on a diff in the pull request: https://github.com/apache/spark/pull/7592#discussion_r35401288 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala --- @@ -149,31 +158,141 @@ private[joins] object HashedRelation { } } +/** + * An extended CompactBuffer that could grow and update. + */ +private[joins] class MutableCompactBuffer[T: ClassTag] extends CompactBuffer[T] { + override def growToSize(newSize: Int): Unit = super.growToSize(newSize) + override def update(i: Int, v: T): Unit = super.update(i, v) +} /** * A HashedRelation for UnsafeRow, which is backed by BytesToBytesMap that maps the key into a * sequence of values. - * - * TODO(davies): use BytesToBytesMap */ private[joins] final class UnsafeHashedRelation( private var hashTable: JavaHashMap[UnsafeRow, CompactBuffer[UnsafeRow]]) extends HashedRelation with Externalizable { - def this() = this(null) // Needed for serialization + private[joins] def this() = this(null) // Needed for serialization + + // Use BytesToBytesMap in executor for better performance (it's created when deserialization) + @transient private[this] var binaryMap: BytesToBytesMap = _ + + // A pool of compact buffers to reduce memory garbage + @transient private[this] val bufferPool = new ThreadLocal[MutableCompactBuffer[UnsafeRow]] - override def get(key: InternalRow): CompactBuffer[InternalRow] = { + override def get(key: InternalRow): Seq[InternalRow] = { val unsafeKey = key.asInstanceOf[UnsafeRow] -// Thanks to type eraser -hashTable.get(unsafeKey).asInstanceOf[CompactBuffer[InternalRow]] + +if (binaryMap != null) { + // Used in Broadcast join + val loc = binaryMap.lookup(unsafeKey.getBaseObject, unsafeKey.getBaseOffset, +unsafeKey.getSizeInBytes) + if (loc.isDefined) { +// thread-local buffer +var buffer = bufferPool.get() +if (buffer == null) { + buffer = new MutableCompactBuffer[UnsafeRow] + bufferPool.set(buffer) +} + +val base = loc.getValueAddress.getBaseObject +var 
offset = loc.getValueAddress.getBaseOffset +val last = loc.getValueAddress.getBaseOffset + loc.getValueLength +var i = 0 +while (offset last) { + val numFields = PlatformDependent.UNSAFE.getInt(base, offset) + val sizeInBytes = PlatformDependent.UNSAFE.getInt(base, offset + 4) + offset += 8 + + // try to re-use the UnsafeRow in buffer, to reduce garbage + buffer.growToSize(i + 1) + if (buffer(i) == null) { +buffer(i) = new UnsafeRow + } + buffer(i).pointTo(base, offset, numFields, sizeInBytes, null) + i += 1 + offset += sizeInBytes +} +buffer + } else { +null + } + +} else { + // Use the JavaHashMap in Local mode or ShuffleHashJoin + hashTable.get(unsafeKey) +} } override def writeExternal(out: ObjectOutput): Unit = { -writeBytes(out, SparkSqlSerializer.serialize(hashTable)) +out.writeInt(hashTable.size()) + +val iter = hashTable.entrySet().iterator() +while (iter.hasNext) { + val entry = iter.next() + val key = entry.getKey + val values = entry.getValue + + // write all the values as single byte array + var totalSize = 0L + var i = 0 + while (i values.size) { +totalSize += values(i).getSizeInBytes + 4 + 4 +i += 1 + } + assert(totalSize Integer.MAX_VALUE, values are too big) + + // [key size] [values size] [key bytes] [values bytes] + out.writeInt(key.getSizeInBytes) + out.writeInt(totalSize.toInt) + out.write(key.getBytes) + i = 0 + while (i values.size) { +// [num of fields] [num of bytes] [row bytes] +// write the integer in native order, so they can be read by UNSAFE.getInt() +if (ByteOrder.nativeOrder() == ByteOrder.BIG_ENDIAN) { + out.writeInt(values(i).length()) + out.writeInt(values(i).getSizeInBytes) +} else { + out.writeInt(Integer.reverseBytes(values(i).length())) + out.writeInt(Integer.reverseBytes(values(i).getSizeInBytes)) +} +out.write(values(i).getBytes) +i += 1 + } +} } override def readExternal(in: ObjectInput): Unit = { -hashTable = SparkSqlSerializer.deserialize(readBytes(in)) +val
[GitHub] spark pull request: [SPARK-9247] [SQL] Use BytesToBytesMap for bro...
Github user davies commented on a diff in the pull request: https://github.com/apache/spark/pull/7592#discussion_r35401411 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala --- @@ -65,18 +72,16 @@ trait HashJoin { { new Iterator[InternalRow] { private[this] var currentStreamedRow: InternalRow = _ - private[this] var currentHashMatches: CompactBuffer[InternalRow] = _ + private[this] var currentHashMatches: Seq[InternalRow] = _ --- End diff -- cc @JoshRosen --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9304] [BUILD] Improve backwards compati...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7639#issuecomment-124394692 Merged build started. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9247] [SQL] Use BytesToBytesMap for bro...
Github user rxin commented on a diff in the pull request: https://github.com/apache/spark/pull/7592#discussion_r35401495 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala --- @@ -149,31 +158,141 @@ private[joins] object HashedRelation { } } +/** + * An extended CompactBuffer that could grow and update. + */ +private[joins] class MutableCompactBuffer[T: ClassTag] extends CompactBuffer[T] { + override def growToSize(newSize: Int): Unit = super.growToSize(newSize) + override def update(i: Int, v: T): Unit = super.update(i, v) +} /** * A HashedRelation for UnsafeRow, which is backed by BytesToBytesMap that maps the key into a * sequence of values. - * - * TODO(davies): use BytesToBytesMap */ private[joins] final class UnsafeHashedRelation( private var hashTable: JavaHashMap[UnsafeRow, CompactBuffer[UnsafeRow]]) extends HashedRelation with Externalizable { - def this() = this(null) // Needed for serialization + private[joins] def this() = this(null) // Needed for serialization + + // Use BytesToBytesMap in executor for better performance (it's created when deserialization) + @transient private[this] var binaryMap: BytesToBytesMap = _ + + // A pool of compact buffers to reduce memory garbage + @transient private[this] val bufferPool = new ThreadLocal[MutableCompactBuffer[UnsafeRow]] - override def get(key: InternalRow): CompactBuffer[InternalRow] = { + override def get(key: InternalRow): Seq[InternalRow] = { val unsafeKey = key.asInstanceOf[UnsafeRow] -// Thanks to type eraser -hashTable.get(unsafeKey).asInstanceOf[CompactBuffer[InternalRow]] + +if (binaryMap != null) { + // Used in Broadcast join + val loc = binaryMap.lookup(unsafeKey.getBaseObject, unsafeKey.getBaseOffset, +unsafeKey.getSizeInBytes) + if (loc.isDefined) { +// thread-local buffer +var buffer = bufferPool.get() +if (buffer == null) { + buffer = new MutableCompactBuffer[UnsafeRow] + bufferPool.set(buffer) +} + +val base = loc.getValueAddress.getBaseObject +var 
offset = loc.getValueAddress.getBaseOffset +val last = loc.getValueAddress.getBaseOffset + loc.getValueLength +var i = 0 +while (offset last) { + val numFields = PlatformDependent.UNSAFE.getInt(base, offset) + val sizeInBytes = PlatformDependent.UNSAFE.getInt(base, offset + 4) + offset += 8 + + // try to re-use the UnsafeRow in buffer, to reduce garbage + buffer.growToSize(i + 1) + if (buffer(i) == null) { +buffer(i) = new UnsafeRow + } + buffer(i).pointTo(base, offset, numFields, sizeInBytes, null) + i += 1 + offset += sizeInBytes +} +buffer + } else { +null + } + +} else { + // Use the JavaHashMap in Local mode or ShuffleHashJoin + hashTable.get(unsafeKey) +} } override def writeExternal(out: ObjectOutput): Unit = { -writeBytes(out, SparkSqlSerializer.serialize(hashTable)) +out.writeInt(hashTable.size()) + +val iter = hashTable.entrySet().iterator() +while (iter.hasNext) { + val entry = iter.next() + val key = entry.getKey + val values = entry.getValue + + // write all the values as single byte array + var totalSize = 0L + var i = 0 + while (i values.size) { +totalSize += values(i).getSizeInBytes + 4 + 4 +i += 1 + } + assert(totalSize Integer.MAX_VALUE, values are too big) + + // [key size] [values size] [key bytes] [values bytes] + out.writeInt(key.getSizeInBytes) + out.writeInt(totalSize.toInt) + out.write(key.getBytes) + i = 0 + while (i values.size) { +// [num of fields] [num of bytes] [row bytes] +// write the integer in native order, so they can be read by UNSAFE.getInt() +if (ByteOrder.nativeOrder() == ByteOrder.BIG_ENDIAN) { + out.writeInt(values(i).length()) + out.writeInt(values(i).getSizeInBytes) +} else { + out.writeInt(Integer.reverseBytes(values(i).length())) + out.writeInt(Integer.reverseBytes(values(i).getSizeInBytes)) +} +out.write(values(i).getBytes) +i += 1 + } +} } override def readExternal(in: ObjectInput): Unit = { -hashTable = SparkSqlSerializer.deserialize(readBytes(in)) +val
[GitHub] spark pull request: [SPARK-8930][SQL] Support a star '*' in genera...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7305#issuecomment-124400159 Merged build finished. Test FAILed. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-8930][SQL] Support a star '*' in genera...
Github user SparkQA commented on the pull request: https://github.com/apache/spark/pull/7305#issuecomment-124399958 [Test build #87 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SlowSparkPullRequestBuilder/87/console) for PR 7305 at commit [`de3423f`](https://github.com/apache/spark/commit/de3423fa4327690f63bf5e6ab8bbee6955cc0da9). * This patch **fails Spark unit tests**. * This patch merges cleanly. * This patch adds no public classes. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [WIP][MLLIB][SPARK-4675][SPARK-4823]RowSimilar...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/6213#issuecomment-124399886 Build started. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [WIP][MLLIB][SPARK-4675][SPARK-4823]RowSimilar...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/6213#issuecomment-124399778 Build triggered. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9230] [ML] Support StringType features ...
Github user ericl commented on a diff in the pull request: https://github.com/apache/spark/pull/7574#discussion_r35397462 --- Diff: mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala --- @@ -130,9 +173,52 @@ class RFormula(override val uid: String) Label column already exists and is not of type DoubleType.) } - private def hasLabelCol(schema: StructType): Boolean = { -schema.map(_.name).contains($(labelCol)) + private def featureTransformer(schema: StructType): Transformer = { +// StringType terms and terms representing interactions need to be encoded before assembly. +// TODO(ekl) add support for feature interactions +var encoderStages = Seq[Transformer]() +var tempColumns = Seq[String]() +val encodedTerms = parsedFormula.terms.map { term = + schema(term) match { +case column if column.dataType == StringType = + val encodedTerm = term + _onehot_ + uid + val indexer = factorLevels(term) + val indexCol = indexer.getOrDefault(indexer.outputCol) + encoderStages :+= indexer + encoderStages :+= new OneHotEncoder() +.setInputCol(indexCol) +.setOutputCol(encodedTerm) + tempColumns :+= encodedTerm + tempColumns :+= indexCol + encodedTerm +case _ = + term + } +} +encoderStages :+= new VectorAssembler(uid) + .setInputCols(encodedTerms.toArray) + .setOutputCol($(featuresCol)) +encoderStages :+= new ColumnPruner(tempColumns.toSet) +new PipelineModel(uid, encoderStages.toArray) + } +} + +/** + * Utility transformer for removing temporary columns from a DataFrame. + */ +private class ColumnPruner(columnsToPrune: Set[String]) extends Transformer { --- End diff -- Done --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. 
--- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9230] [ML] Support StringType features ...
Github user ericl commented on a diff in the pull request: https://github.com/apache/spark/pull/7574#discussion_r35397464 --- Diff: mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala --- @@ -130,9 +173,52 @@ class RFormula(override val uid: String) Label column already exists and is not of type DoubleType.) } - private def hasLabelCol(schema: StructType): Boolean = { -schema.map(_.name).contains($(labelCol)) + private def featureTransformer(schema: StructType): Transformer = { +// StringType terms and terms representing interactions need to be encoded before assembly. +// TODO(ekl) add support for feature interactions +var encoderStages = Seq[Transformer]() +var tempColumns = Seq[String]() +val encodedTerms = parsedFormula.terms.map { term = + schema(term) match { +case column if column.dataType == StringType = + val encodedTerm = term + _onehot_ + uid + val indexer = factorLevels(term) + val indexCol = indexer.getOrDefault(indexer.outputCol) + encoderStages :+= indexer + encoderStages :+= new OneHotEncoder() +.setInputCol(indexCol) +.setOutputCol(encodedTerm) + tempColumns :+= encodedTerm + tempColumns :+= indexCol + encodedTerm +case _ = + term + } +} +encoderStages :+= new VectorAssembler(uid) + .setInputCols(encodedTerms.toArray) + .setOutputCol($(featuresCol)) +encoderStages :+= new ColumnPruner(tempColumns.toSet) +new PipelineModel(uid, encoderStages.toArray) + } +} + +/** + * Utility transformer for removing temporary columns from a DataFrame. + */ +private class ColumnPruner(columnsToPrune: Set[String]) extends Transformer { + override val uid = Identifiable.randomUID(columnPruner) + override def transform(dataset: DataFrame): DataFrame = { +var res: DataFrame = dataset +for (column - columnsToPrune) { + res = res.drop(column) --- End diff -- Done --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. 
If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9230] [ML] Support StringType features ...
Github user ericl commented on a diff in the pull request: https://github.com/apache/spark/pull/7574#discussion_r35397461 --- Diff: mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala --- @@ -62,19 +77,60 @@ class RFormula(override val uid: String) /** @group getParam */ def getFormula: String = $(formula) - /** @group getParam */ - def setFeaturesCol(value: String): this.type = set(featuresCol, value) + override def fit(dataset: DataFrame): RFormulaModel = { +require(parsedFormula.isDefined, Must call setFormula() first.) +val factorLevels = parsedFormula.get.terms.flatMap { term = + dataset.schema(term) match { +case column if column.dataType == StringType = + val idxTerm = term + _idx_ + uid + val indexer = new StringIndexer().setInputCol(term).setOutputCol(idxTerm) + Some(term - indexer.fit(dataset)) +case _ = + None --- End diff -- Done --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9303] Decimal should use java.math.Deci...
GitHub user JoshRosen opened a pull request: https://github.com/apache/spark/pull/7635 [SPARK-9303] Decimal should use java.math.Decimal directly instead of using Scala wrapper Spark SQL's `Decimal` class should use Java's BigDecimal instead of Scala's, since this removes a layer of object allocation and works around an issue where Scala's BigDecimal.hashCode() can lead to OOMs (see https://issues.scala-lang.org/browse/SI-6173). You can merge this pull request into a Git repository by running: $ git pull https://github.com/JoshRosen/spark SPARK-9303 Alternatively you can review and apply these changes as the patch at: https://github.com/apache/spark/pull/7635.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #7635 commit d7a35358e2068eca9bdead2b93f3b96dcaf890d8 Author: Josh Rosen joshro...@databricks.com Date: 2015-07-24T06:02:13Z [SPARK-9303] Decimal should use java.math.Decimal directly instead of via Scala wrapper --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9303] Decimal should use java.math.Deci...
Github user rxin commented on a diff in the pull request: https://github.com/apache/spark/pull/7635#discussion_r35397756 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala --- @@ -135,19 +137,19 @@ final class Decimal extends Ordered[Decimal] with Serializable { this } - def toBigDecimal: BigDecimal = { + def toBigDecimal: BigDecimal = BigDecimal(toJavaBigDecimal) --- End diff -- Decimal is not meant to be public. I think it just accidentally became public. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9200][SQL] Don't implicitly cast non-at...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7636#issuecomment-124345665 Merged build started. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9200][SQL] Don't implicitly cast non-at...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7636#issuecomment-124345653 Merged build triggered. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [build] Enable memory leak detection for Tungs...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7637#issuecomment-124345647 Merged build triggered. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [build] Enable memory leak detection for Tungs...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7637#issuecomment-124345667 Merged build started. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9069] [SQL] follow up
Github user SparkQA commented on the pull request: https://github.com/apache/spark/pull/7634#issuecomment-124345657 [Test build #38321 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38321/console) for PR 7634 at commit [`6a91f32`](https://github.com/apache/spark/commit/6a91f32112c02a507319bc7d090b9256739181e4). * This patch **fails Spark unit tests**. * This patch merges cleanly. * This patch adds the following public classes _(experimental)_: * `case class ChangeDecimalPrecision(child: Expression) extends UnaryExpression ` --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9069] [SQL] follow up
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7634#issuecomment-124345679 Merged build finished. Test FAILed. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9230] [ML] Support StringType features ...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7574#issuecomment-124346736 Merged build finished. Test PASSed. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9230] [ML] Support StringType features ...
Github user SparkQA commented on the pull request: https://github.com/apache/spark/pull/7574#issuecomment-124346148 [Test build #38316 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38316/console) for PR 7574 at commit [`c302a2c`](https://github.com/apache/spark/commit/c302a2c40088de89feb37964f182de33279df818). * This patch **passes all tests**. * This patch merges cleanly. * This patch adds the following public classes _(experimental)_: * `class RFormula(override val uid: String) extends Estimator[RFormulaModel] with RFormulaBase ` --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9069] [SQL] follow up
Github user SparkQA commented on the pull request: https://github.com/apache/spark/pull/7634#issuecomment-124354807 [Test build #38326 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38326/consoleFull) for PR 7634 at commit [`65b251c`](https://github.com/apache/spark/commit/65b251cf40fd9975d3c194e49ba27038f351069c). --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9170][SQL] Instead of StandardStructObj...
Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/7520#discussion_r35398948 --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala --- @@ -86,19 +86,10 @@ private[orc] class OrcOutputWriter( TypeInfoUtils.getTypeInfoFromTypeString( HiveMetastoreTypes.toMetastoreType(dataSchema)) -TypeInfoUtils - .getStandardJavaObjectInspectorFromTypeInfo(typeInfo) - .asInstanceOf[StructObjectInspector] +OrcStruct.createObjectInspector(typeInfo.asInstanceOf[StructTypeInfo]) + .asInstanceOf[SettableStructObjectInspector] --- End diff -- `OrcStruct.createObjectInspector ` will return a `OrcStructInspector` for STRUCT data type. It is a `SettableStructObjectInspector`. Because we pass a `StructTypeInfo` when calling `createObjectInspector`, I think it should be safe. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9069] [SQL] follow up
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7634#issuecomment-124353841 Merged build triggered. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9247] [SQL] Use BytesToBytesMap for bro...
Github user rxin commented on a diff in the pull request: https://github.com/apache/spark/pull/7592#discussion_r35399269 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala --- @@ -149,31 +158,141 @@ private[joins] object HashedRelation { } } +/** + * An extended CompactBuffer that could grow and update. + */ +private[joins] class MutableCompactBuffer[T: ClassTag] extends CompactBuffer[T] { + override def growToSize(newSize: Int): Unit = super.growToSize(newSize) + override def update(i: Int, v: T): Unit = super.update(i, v) +} /** * A HashedRelation for UnsafeRow, which is backed by BytesToBytesMap that maps the key into a * sequence of values. - * - * TODO(davies): use BytesToBytesMap */ private[joins] final class UnsafeHashedRelation( private var hashTable: JavaHashMap[UnsafeRow, CompactBuffer[UnsafeRow]]) extends HashedRelation with Externalizable { - def this() = this(null) // Needed for serialization + private[joins] def this() = this(null) // Needed for serialization + + // Use BytesToBytesMap in executor for better performance (it's created when deserialization) + @transient private[this] var binaryMap: BytesToBytesMap = _ + + // A pool of compact buffers to reduce memory garbage + @transient private[this] val bufferPool = new ThreadLocal[MutableCompactBuffer[UnsafeRow]] --- End diff -- buffer pool has a special meaning in databases. we should pick a different name. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [WIP][SPARK-8977][Streaming] Defines the RateE...
Github user tdas commented on a diff in the pull request: https://github.com/apache/spark/pull/7600#discussion_r35399262 --- Diff: streaming/src/main/scala/org/apache/spark/streaming/scheduler/RateController.scala --- @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the License); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.streaming.scheduler + +import java.util.concurrent.atomic.AtomicLong + +import scala.concurrent.{ExecutionContext, Future} + +import org.apache.spark.streaming.scheduler.rate.RateEstimator +import org.apache.spark.util.ThreadUtils + +/** + * A StreamingListener that receives batch completion updates, and maintains + * an estimate of the speed at which this stream should ingest messages, + * given an estimate computation from a `RateEstimator` + */ +private [streaming] abstract class RateController(val streamUID: Int, rateEstimator: RateEstimator) + extends StreamingListener with Serializable { + + protected def publish(rate: Long): Unit + + // Used to compute publish the rate update asynchronously + @transient + implicit private val executionContext = ExecutionContext.fromExecutorService( +ThreadUtils.newDaemonSingleThreadExecutor(stream-rate-update)) + + private val rateLimit: AtomicLong = new AtomicLong(-1L) + + /** + * Compute the new rate limit and publish it asynchronously. + */ + private def computeAndPublish(time: Long, elems: Long, workDelay: Long, waitDelay: Long): Unit = +Future[Unit] { + val newSpeed = rateEstimator.compute(time, elems, workDelay, waitDelay) + newSpeed foreach { s = --- End diff -- We dont generally use postfix notation. use `newSpeed.foraech` --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9293] Analysis should check that set op...
Github user cloud-fan commented on the pull request: https://github.com/apache/spark/pull/7631#issuecomment-124362448 I tried it locally, hive will report error if we union 2 `select` with different output length. But in our test we union 2 `InsertIntoTable` operations which seems a special case for hive (or maybe the output of `InsertIntoTable` is `Nil` in hive?). --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [WIP][SPARK-8977][Streaming] Defines the RateE...
Github user tdas commented on a diff in the pull request: https://github.com/apache/spark/pull/7600#discussion_r35399267 --- Diff: streaming/src/test/scala/org/apache/spark/streaming/scheduler/RateControllerSuite.scala --- @@ -0,0 +1,142 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the License); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.streaming.scheduler + +import scala.collection.mutable +import scala.reflect.ClassTag +import scala.util.control.NonFatal + +import org.scalatest.Matchers._ +import org.scalatest.concurrent.Eventually._ +import org.scalatest.time.SpanSugar._ + +import org.apache.spark.streaming._ +import org.apache.spark.streaming.scheduler.rate.RateEstimator + + + --- End diff -- nit: extra line --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [WIP][SPARK-8977][Streaming] Defines the RateE...
Github user tdas commented on a diff in the pull request: https://github.com/apache/spark/pull/7600#discussion_r35399265 --- Diff: streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/RateEstimator.scala --- @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the License); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.streaming.scheduler.rate + +import org.apache.spark.SparkConf +import org.apache.spark.SparkException + +/** + * A component that estimates the rate at wich an InputDStream should ingest + * elements, based on updates at every batch completion. + */ +private[streaming] trait RateEstimator extends Serializable { + + /** + * Computes the number of elements the stream attached to this `RateEstimator` + * should ingest per second, given an update on the size and completion + * times of the latest batch. 
+ * + * @param time The timetamp of the current batch interval that just finished + * @param elements The number of elements that were processed in this batch + * @param processingDelay The time in ms that took for the job to complete + * @param schedulingDelay The time in ms that the job spent in the scheduling queue + */ + def compute( + time: Long, + elements: Long, + processingDelay: Long, + schedulingDelay: Long): Option[Double] +} + +object RateEstimator { + + /** + * Return a new RateEstimator based on the value of `spark.streaming.RateEstimator`. + * + * @return None if there is no configured estimator, otherwise an instance of RateEstimator + * @throws IllegalArgumentException if there is a configured RateEstimator that doesn't match any + * known estimators. + */ + def makeEstimator(conf: SparkConf): Option[RateEstimator] = +conf.getOption(spark.streaming.RateEstimator) map { estimator = --- End diff -- Let's make the configuration params as follows `spark.streaming.backpressure.enable = true/false` to enable/disable the whole feature. false by default in 1.5. Also to the docs/configuration.md. `spark.streaming.backpressure.rateEstimator = pid` to specify which algorithm to use for estimating. DO NOT add this to the docs/configuration. Basically, the scope `spark.streaming.backpressure` will contain on backpressure related configuration. Accordingly, the RateController in InputDStreams will be created as `RateController.create()`, and the estimator will be created by `RateEstimator.create`. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [WIP][SPARK-8977][Streaming] Defines the RateE...
Github user tdas commented on a diff in the pull request: https://github.com/apache/spark/pull/7600#discussion_r35399259 --- Diff: streaming/src/main/scala/org/apache/spark/streaming/scheduler/RateController.scala --- @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the License); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.streaming.scheduler + +import java.util.concurrent.atomic.AtomicLong + +import scala.concurrent.{ExecutionContext, Future} + +import org.apache.spark.streaming.scheduler.rate.RateEstimator +import org.apache.spark.util.ThreadUtils + +/** + * A StreamingListener that receives batch completion updates, and maintains + * an estimate of the speed at which this stream should ingest messages, + * given an estimate computation from a `RateEstimator` + */ +private [streaming] abstract class RateController(val streamUID: Int, rateEstimator: RateEstimator) + extends StreamingListener with Serializable { + + protected def publish(rate: Long): Unit + + // Used to compute publish the rate update asynchronously + @transient + implicit private val executionContext = ExecutionContext.fromExecutorService( +ThreadUtils.newDaemonSingleThreadExecutor(stream-rate-update)) + + private val rateLimit: AtomicLong = new AtomicLong(-1L) + + /** + * Compute the new rate limit and publish it asynchronously. + */ + private def computeAndPublish(time: Long, elems: Long, workDelay: Long, waitDelay: Long): Unit = +Future[Unit] { + val newSpeed = rateEstimator.compute(time, elems, workDelay, waitDelay) --- End diff -- newSpeed -- newRate --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9170][SQL] Instead of StandardStructObj...
Github user SparkQA commented on the pull request: https://github.com/apache/spark/pull/7520#issuecomment-124365064 [Test build #38327 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38327/consoleFull) for PR 7520 at commit [`96796da`](https://github.com/apache/spark/commit/96796da0abaf695bcda52fbd395413bf6e9520de). --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [WIP][SPARK-8977][Streaming] Defines the RateE...
Github user tdas commented on the pull request: https://github.com/apache/spark/pull/7600#issuecomment-124363987 Time to remove the WIP from the title :) It's looking good. Will take another pass once these comments are addressed. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9209] Using executor allocation, a exec...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7559#issuecomment-124372120 Merged build started. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9043] Serialize key, value and combiner...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7403#issuecomment-124379596 Merged build finished. Test FAILed. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9043] Serialize key, value and combiner...
Github user SparkQA commented on the pull request: https://github.com/apache/spark/pull/7403#issuecomment-124379407 [Test build #38310 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38310/console) for PR 7403 at commit [`d07b771`](https://github.com/apache/spark/commit/d07b771f787166257691d8c4e9bce2adfc189860). * This patch **fails Spark unit tests**. * This patch merges cleanly. * This patch adds the following public classes _(experimental)_: * `class ShuffleDependency[K: ClassTag, V: ClassTag, C: ClassTag](` * `class CoGroupedRDD[K: ClassTag](@transient var rdds: Seq[RDD[_ : Product2[K, _]]],` * `class ShuffledRDD[K: ClassTag, V: ClassTag, C: ClassTag](` --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9285][SQL] Remove InternalRow's inherit...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7626#issuecomment-124390396 Merged build triggered. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9285][SQL] Remove InternalRow's inherit...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7626#issuecomment-124390411 Merged build started. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-5155] [PySpark] [Streaming] Mqtt stream...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/4229#issuecomment-124390582 Merged build finished. Test FAILed. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-5155] [PySpark] [Streaming] Mqtt stream...
Github user SparkQA commented on the pull request: https://github.com/apache/spark/pull/4229#issuecomment-124390541 [Test build #86 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SlowSparkPullRequestBuilder/86/console) for PR 4229 at commit [`87fc677`](https://github.com/apache/spark/commit/87fc6771586aaf025c2810b5fa3a160a9773c4d2). * This patch **fails Spark unit tests**. * This patch merges cleanly. * This patch adds the following public classes _(experimental)_: * `class MQTTUtils(object):` --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9247] [SQL] Use BytesToBytesMap for bro...
Github user rxin commented on the pull request: https://github.com/apache/spark/pull/7592#issuecomment-124398910 @davies this is ok, but what I had in mind was pretty different. What I had in mind was to create a version of the join operator, that could take more advantage of what unsafe provides. There are still quite a bit of conversions involved right now. HashRelation is also limiting you, due to the shared variable across multiple threads. I think we will need to create a new join operator soon anyway, in order to avoid the join row conversion. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-4352][YARN][WIP] Incorporate locality p...
Github user jerryshao commented on a diff in the pull request: https://github.com/apache/spark/pull/6394#discussion_r35401677 --- Diff: core/src/main/scala/org/apache/spark/ExecutorAllocationClient.scala --- @@ -28,7 +28,10 @@ private[spark] trait ExecutorAllocationClient { * This can result in canceling pending requests or filing additional requests. * @return whether the request is acknowledged by the cluster manager. */ - private[spark] def requestTotalExecutors(numExecutors: Int): Boolean + private[spark] def requestTotalExecutors( + numExecutors: Int, --- End diff -- Seems I miss this comment, will update the code. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9069] [SQL] follow up
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7634#issuecomment-124342521 Merged build started. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9069] [SQL] follow up
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7634#issuecomment-124342480 Merged build triggered. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-5373][SQL] use transformDown when rewri...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7583#issuecomment-124341890 Merged build finished. Test PASSed. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-5373][SQL] use transformDown when rewri...
Github user SparkQA commented on the pull request: https://github.com/apache/spark/pull/7583#issuecomment-124341684 [Test build #38307 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38307/console) for PR 7583 at commit [`9d59628`](https://github.com/apache/spark/commit/9d59628bce695a2825c74e5f5d5c2fa6ab40c734). * This patch **passes all tests**. * This patch merges cleanly. * This patch adds no public classes. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9303] Decimal should use java.math.Deci...
Github user davies commented on a diff in the pull request: https://github.com/apache/spark/pull/7635#discussion_r35397974 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala --- @@ -135,19 +137,19 @@ final class Decimal extends Ordered[Decimal] with Serializable { this } - def toBigDecimal: BigDecimal = { + def toBigDecimal: BigDecimal = BigDecimal(toJavaBigDecimal) --- End diff -- I think user's code should never touch Decimal --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-8951][SparkR] support Unicode character...
Github user SparkQA commented on the pull request: https://github.com/apache/spark/pull/7494#issuecomment-124343702 [Test build #38304 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38304/console) for PR 7494 at commit [`bc469d8`](https://github.com/apache/spark/commit/bc469d8ae91024675abb37b76a1354c99206c5c2). * This patch **passes all tests**. * This patch merges cleanly. * This patch adds no public classes. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-8951][SparkR] support Unicode character...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7494#issuecomment-124343874 Merged build finished. Test PASSed. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9069] [SQL] follow up
Github user SparkQA commented on the pull request: https://github.com/apache/spark/pull/7634#issuecomment-124342964 [Test build #38321 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38321/consoleFull) for PR 7634 at commit [`6a91f32`](https://github.com/apache/spark/commit/6a91f32112c02a507319bc7d090b9256739181e4). --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-8930][SQL] Support a star '*' in genera...
Github user SparkQA commented on the pull request: https://github.com/apache/spark/pull/7305#issuecomment-124345120 [Test build #87 has started](https://amplab.cs.berkeley.edu/jenkins/job/SlowSparkPullRequestBuilder/87/consoleFull) for PR 7305 at commit [`de3423f`](https://github.com/apache/spark/commit/de3423fa4327690f63bf5e6ab8bbee6955cc0da9). --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9200][SQL] Don't implicitly cast non-at...
GitHub user rxin opened a pull request: https://github.com/apache/spark/pull/7636 [SPARK-9200][SQL] Don't implicitly cast non-atomic types to string type. You can merge this pull request into a Git repository by running: $ git pull https://github.com/rxin/spark complex-string-implicit-cast Alternatively you can review and apply these changes as the patch at: https://github.com/apache/spark/pull/7636.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #7636 commit 3e67327e72ac4fe6f9e1ade4fe56d140fa2303d1 Author: Reynold Xin r...@databricks.com Date: 2015-07-24T06:28:01Z [SPARK-9200][SQL] Don't implicitly cast non-atomic types to string type. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-8930][SQL] Support a star '*' in genera...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7305#issuecomment-124345056 Merged build started. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9200][SQL] Don't implicitly cast non-at...
Github user SparkQA commented on the pull request: https://github.com/apache/spark/pull/7636#issuecomment-124345792 [Test build #38325 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38325/consoleFull) for PR 7636 at commit [`3e67327`](https://github.com/apache/spark/commit/3e67327e72ac4fe6f9e1ade4fe56d140fa2303d1). --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [build] Enable memory leak detection for Tungs...
Github user SparkQA commented on the pull request: https://github.com/apache/spark/pull/7637#issuecomment-124345771 [Test build #38324 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38324/consoleFull) for PR 7637 at commit [`34bc3ef`](https://github.com/apache/spark/commit/34bc3efa60480f3af73cec4f85132e1afc2c5c7c). --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9303] Decimal should use java.math.Deci...
Github user rxin commented on a diff in the pull request: https://github.com/apache/spark/pull/7635#discussion_r35398709 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala --- @@ -135,19 +137,19 @@ final class Decimal extends Ordered[Decimal] with Serializable { this } - def toBigDecimal: BigDecimal = { + def toBigDecimal: BigDecimal = BigDecimal(toJavaBigDecimal) --- End diff -- We should remove it, but it is ok to leave it in for now I think. Most likely this entire class will look very different soon... --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9152][SQL] Implement code generation fo...
Github user viirya commented on the pull request: https://github.com/apache/spark/pull/7561#issuecomment-124356779 ping @davies @rxin --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9170][SQL] Instead of StandardStructObj...
Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/7520#discussion_r35399063 --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala --- @@ -120,15 +111,11 @@ private[orc] class OrcOutputWriter( } override def write(row: Row): Unit = { -var i = 0 -while (i < row.length) { - reusableOutputBuffer(i) = wrappers(i)(row(i)) - i += 1 -} +val orcRow = wrap(row, structOI) --- End diff -- Agreed. It will be a concern. I will update this part to reuse an `OrcStruct`. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9295] Analysis should detect sorting on...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7633#issuecomment-124365696 Merged build started. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9065][Streaming][PySpark] Add MessageHa...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7410#issuecomment-124365477 Merged build finished. Test FAILed. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9295] Analysis should detect sorting on...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7633#issuecomment-124365550 Merged build triggered. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9065][Streaming][PySpark] Add MessageHa...
Github user SparkQA commented on the pull request: https://github.com/apache/spark/pull/7410#issuecomment-124365161 [Test build #38309 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38309/console) for PR 7410 at commit [`a09423e`](https://github.com/apache/spark/commit/a09423ed6ce4ab28d8e52e4ba0a8d80d1ae098a1). * This patch **fails Spark unit tests**. * This patch merges cleanly. * This patch adds the following public classes _(experimental)_: * ` case class PythonMessageAndMetadata(` * ` class PythonMessageAndMetadataPickler extends IObjectPickler ` * `class KafkaMessageAndMetadata(object):` * `case class ChangePrecision(child: Expression) extends UnaryExpression ` * `abstract class AlgebraicAggregate extends AggregateFunction2 with Serializable with Unevaluable ` * `abstract class AggregateFunction1 extends LeafExpression with Serializable ` * `case class DecimalType(precision: Int, scale: Int) extends FractionalType ` * ` case class DecimalConversion(precision: Int, scale: Int) extends JDBCConversion` --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9305] Rename org.apache.spark.Row to It...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7638#issuecomment-124383061 Merged build started. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9305] Rename org.apache.spark.Row to It...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/7638#issuecomment-124383030 Merged build triggered. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9305] Rename org.apache.spark.Row to It...
GitHub user rxin opened a pull request: https://github.com/apache/spark/pull/7638 [SPARK-9305] Rename org.apache.spark.Row to Item. It's a thing used in test cases, but named Row. Pretty annoying because every time I search for Row, it shows up before the Spark SQL Row, which is what a developer wants most of the time. You can merge this pull request into a Git repository by running: $ git pull https://github.com/rxin/spark remove-row Alternatively you can review and apply these changes as the patch at: https://github.com/apache/spark/pull/7638.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #7638 commit aeda52d4635aaf850355d91313b437161117288b Author: Reynold Xin r...@databricks.com Date: 2015-07-24T07:15:30Z [SPARK-9305] Rename org.apache.spark.Row to Item. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9305] Rename org.apache.spark.Row to It...
Github user SparkQA commented on the pull request: https://github.com/apache/spark/pull/7638#issuecomment-124385556 [Test build #38330 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38330/consoleFull) for PR 7638 at commit [`aeda52d`](https://github.com/apache/spark/commit/aeda52d4635aaf850355d91313b437161117288b). --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-8564][Streaming]Add the Python API for ...
Github user SparkQA commented on the pull request: https://github.com/apache/spark/pull/6955#issuecomment-124389877 [Test build #38312 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/38312/console) for PR 6955 at commit [`6c37395`](https://github.com/apache/spark/commit/6c37395b2a4d0d56aa4a8d9ec76899c6066b29ab). * This patch **fails Spark unit tests**. * This patch merges cleanly. * This patch adds the following public classes _(experimental)_: * `class KinesisUtils(object):` * `class InitialPositionInStream(object):` --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-8564][Streaming]Add the Python API for ...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/6955#issuecomment-124389905 Merged build finished. Test FAILed. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-9247] [SQL] Use BytesToBytesMap for bro...
Github user rxin commented on a diff in the pull request: https://github.com/apache/spark/pull/7592#discussion_r35400866 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala --- @@ -149,31 +158,141 @@ private[joins] object HashedRelation { } } +/** + * An extended CompactBuffer that could grow and update. + */ +private[joins] class MutableCompactBuffer[T: ClassTag] extends CompactBuffer[T] { + override def growToSize(newSize: Int): Unit = super.growToSize(newSize) + override def update(i: Int, v: T): Unit = super.update(i, v) +} /** * A HashedRelation for UnsafeRow, which is backed by BytesToBytesMap that maps the key into a * sequence of values. - * - * TODO(davies): use BytesToBytesMap */ private[joins] final class UnsafeHashedRelation( private var hashTable: JavaHashMap[UnsafeRow, CompactBuffer[UnsafeRow]]) extends HashedRelation with Externalizable { - def this() = this(null) // Needed for serialization + private[joins] def this() = this(null) // Needed for serialization + + // Use BytesToBytesMap in executor for better performance (it's created when deserialization) + @transient private[this] var binaryMap: BytesToBytesMap = _ + + // A pool of compact buffers to reduce memory garbage + @transient private[this] val bufferPool = new ThreadLocal[MutableCompactBuffer[UnsafeRow]] - override def get(key: InternalRow): CompactBuffer[InternalRow] = { + override def get(key: InternalRow): Seq[InternalRow] = { val unsafeKey = key.asInstanceOf[UnsafeRow] -// Thanks to type eraser -hashTable.get(unsafeKey).asInstanceOf[CompactBuffer[InternalRow]] + +if (binaryMap != null) { + // Used in Broadcast join + val loc = binaryMap.lookup(unsafeKey.getBaseObject, unsafeKey.getBaseOffset, +unsafeKey.getSizeInBytes) + if (loc.isDefined) { +// thread-local buffer +var buffer = bufferPool.get() +if (buffer == null) { + buffer = new MutableCompactBuffer[UnsafeRow] + bufferPool.set(buffer) +} + +val base = loc.getValueAddress.getBaseObject +var 
offset = loc.getValueAddress.getBaseOffset +val last = loc.getValueAddress.getBaseOffset + loc.getValueLength +var i = 0 +while (offset last) { + val numFields = PlatformDependent.UNSAFE.getInt(base, offset) + val sizeInBytes = PlatformDependent.UNSAFE.getInt(base, offset + 4) + offset += 8 + + // try to re-use the UnsafeRow in buffer, to reduce garbage + buffer.growToSize(i + 1) + if (buffer(i) == null) { +buffer(i) = new UnsafeRow + } + buffer(i).pointTo(base, offset, numFields, sizeInBytes, null) + i += 1 + offset += sizeInBytes +} +buffer + } else { +null + } + +} else { + // Use the JavaHashMap in Local mode or ShuffleHashJoin + hashTable.get(unsafeKey) +} } override def writeExternal(out: ObjectOutput): Unit = { -writeBytes(out, SparkSqlSerializer.serialize(hashTable)) +out.writeInt(hashTable.size()) + +val iter = hashTable.entrySet().iterator() +while (iter.hasNext) { + val entry = iter.next() + val key = entry.getKey + val values = entry.getValue + + // write all the values as single byte array + var totalSize = 0L + var i = 0 + while (i values.size) { +totalSize += values(i).getSizeInBytes + 4 + 4 +i += 1 + } + assert(totalSize Integer.MAX_VALUE, values are too big) + + // [key size] [values size] [key bytes] [values bytes] + out.writeInt(key.getSizeInBytes) + out.writeInt(totalSize.toInt) + out.write(key.getBytes) + i = 0 + while (i values.size) { +// [num of fields] [num of bytes] [row bytes] +// write the integer in native order, so they can be read by UNSAFE.getInt() +if (ByteOrder.nativeOrder() == ByteOrder.BIG_ENDIAN) { + out.writeInt(values(i).length()) + out.writeInt(values(i).getSizeInBytes) +} else { + out.writeInt(Integer.reverseBytes(values(i).length())) + out.writeInt(Integer.reverseBytes(values(i).getSizeInBytes)) +} +out.write(values(i).getBytes) +i += 1 + } +} } override def readExternal(in: ObjectInput): Unit = { -hashTable = SparkSqlSerializer.deserialize(readBytes(in)) +val
[GitHub] spark pull request: [SPARK-9247] [SQL] Use BytesToBytesMap for bro...
Github user rxin commented on a diff in the pull request: https://github.com/apache/spark/pull/7592#discussion_r35400801 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala --- @@ -149,31 +158,141 @@ private[joins] object HashedRelation { } } +/** + * An extended CompactBuffer that could grow and update. + */ +private[joins] class MutableCompactBuffer[T: ClassTag] extends CompactBuffer[T] { + override def growToSize(newSize: Int): Unit = super.growToSize(newSize) + override def update(i: Int, v: T): Unit = super.update(i, v) +} /** * A HashedRelation for UnsafeRow, which is backed by BytesToBytesMap that maps the key into a * sequence of values. - * - * TODO(davies): use BytesToBytesMap */ private[joins] final class UnsafeHashedRelation( private var hashTable: JavaHashMap[UnsafeRow, CompactBuffer[UnsafeRow]]) extends HashedRelation with Externalizable { - def this() = this(null) // Needed for serialization + private[joins] def this() = this(null) // Needed for serialization + + // Use BytesToBytesMap in executor for better performance (it's created when deserialization) + @transient private[this] var binaryMap: BytesToBytesMap = _ + + // A pool of compact buffers to reduce memory garbage + @transient private[this] val bufferPool = new ThreadLocal[MutableCompactBuffer[UnsafeRow]] - override def get(key: InternalRow): CompactBuffer[InternalRow] = { + override def get(key: InternalRow): Seq[InternalRow] = { val unsafeKey = key.asInstanceOf[UnsafeRow] -// Thanks to type eraser -hashTable.get(unsafeKey).asInstanceOf[CompactBuffer[InternalRow]] + +if (binaryMap != null) { + // Used in Broadcast join + val loc = binaryMap.lookup(unsafeKey.getBaseObject, unsafeKey.getBaseOffset, +unsafeKey.getSizeInBytes) + if (loc.isDefined) { +// thread-local buffer +var buffer = bufferPool.get() +if (buffer == null) { + buffer = new MutableCompactBuffer[UnsafeRow] + bufferPool.set(buffer) +} + +val base = loc.getValueAddress.getBaseObject +var 
offset = loc.getValueAddress.getBaseOffset +val last = loc.getValueAddress.getBaseOffset + loc.getValueLength +var i = 0 +while (offset last) { + val numFields = PlatformDependent.UNSAFE.getInt(base, offset) + val sizeInBytes = PlatformDependent.UNSAFE.getInt(base, offset + 4) + offset += 8 + + // try to re-use the UnsafeRow in buffer, to reduce garbage + buffer.growToSize(i + 1) + if (buffer(i) == null) { +buffer(i) = new UnsafeRow + } + buffer(i).pointTo(base, offset, numFields, sizeInBytes, null) + i += 1 + offset += sizeInBytes +} +buffer + } else { +null + } + +} else { + // Use the JavaHashMap in Local mode or ShuffleHashJoin + hashTable.get(unsafeKey) +} } override def writeExternal(out: ObjectOutput): Unit = { -writeBytes(out, SparkSqlSerializer.serialize(hashTable)) +out.writeInt(hashTable.size()) + +val iter = hashTable.entrySet().iterator() +while (iter.hasNext) { + val entry = iter.next() + val key = entry.getKey + val values = entry.getValue + + // write all the values as single byte array + var totalSize = 0L + var i = 0 + while (i values.size) { +totalSize += values(i).getSizeInBytes + 4 + 4 +i += 1 + } + assert(totalSize Integer.MAX_VALUE, values are too big) + + // [key size] [values size] [key bytes] [values bytes] + out.writeInt(key.getSizeInBytes) + out.writeInt(totalSize.toInt) + out.write(key.getBytes) + i = 0 + while (i values.size) { +// [num of fields] [num of bytes] [row bytes] +// write the integer in native order, so they can be read by UNSAFE.getInt() +if (ByteOrder.nativeOrder() == ByteOrder.BIG_ENDIAN) { + out.writeInt(values(i).length()) + out.writeInt(values(i).getSizeInBytes) +} else { + out.writeInt(Integer.reverseBytes(values(i).length())) + out.writeInt(Integer.reverseBytes(values(i).getSizeInBytes)) +} +out.write(values(i).getBytes) +i += 1 + } +} } override def readExternal(in: ObjectInput): Unit = { -hashTable = SparkSqlSerializer.deserialize(readBytes(in)) +val