Repository: spark Updated Branches: refs/heads/branch-1.5 5186ec8ac -> 2195fec12
[SPARK-11051][CORE] Do not allow local checkpointing after the RDD is materialized and checkpointed JIRA: https://issues.apache.org/jira/browse/SPARK-11051 When a `RDD` is materialized and checkpointed, its partitions and dependencies are cleared. If we allow local checkpointing on it and assign `LocalRDDCheckpointData` to its `checkpointData`. Next time when the RDD is materialized again, the error will be thrown. Author: Liang-Chi Hsieh <vii...@appier.com> Closes #9072 from viirya/no-localcheckpoint-after-checkpoint. (cherry picked from commit a1413b3662250dd5e980e8b1f7c3dc4585ab4766) Signed-off-by: Andrew Or <and...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2195fec1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2195fec1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2195fec1 Branch: refs/heads/branch-1.5 Commit: 2195fec125b2027c734ba4c6940b7fd8dfc5d10b Parents: 5186ec8 Author: Liang-Chi Hsieh <vii...@appier.com> Authored: Mon Oct 19 16:16:31 2015 -0700 Committer: Andrew Or <and...@databricks.com> Committed: Mon Oct 19 16:16:39 2015 -0700 ---------------------------------------------------------------------- .../main/scala/org/apache/spark/rdd/RDD.scala | 35 ++++++++++++++++---- .../org/apache/spark/CheckpointSuite.scala | 4 +++ 2 files changed, 32 insertions(+), 7 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/2195fec1/core/src/main/scala/org/apache/spark/rdd/RDD.scala ---------------------------------------------------------------------- diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index 081c721..20b115f 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -294,7 +294,11 @@ abstract class RDD[T: ClassTag]( */ private[spark] def computeOrReadCheckpoint(split: Partition, context: TaskContext): Iterator[T] = { - if (isCheckpointed) firstParent[T].iterator(split, context) else compute(split, context) + if (isCheckpointedAndMaterialized) { + firstParent[T].iterator(split, context) + } else { + compute(split, context) + } } /** @@ -1526,21 +1530,38 @@ abstract class RDD[T: ClassTag]( persist(LocalRDDCheckpointData.transformStorageLevel(storageLevel), allowOverride = true) } - checkpointData match { - case Some(reliable: ReliableRDDCheckpointData[_]) => logWarning( - "RDD was already marked for reliable checkpointing: overriding with local checkpoint.") - case _ => + // If this RDD is already checkpointed and materialized, its lineage is already truncated. + // We must not override our `checkpointData` in this case because it is needed to recover + // the checkpointed data. If it is overridden, next time materializing on this RDD will + // cause error. + if (isCheckpointedAndMaterialized) { + logWarning("Not marking RDD for local checkpoint because it was already " + + "checkpointed and materialized") + } else { + // Lineage is not truncated yet, so just override any existing checkpoint data with ours + checkpointData match { + case Some(_: ReliableRDDCheckpointData[_]) => logWarning( + "RDD was already marked for reliable checkpointing: overriding with local checkpoint.") + case _ => + } + checkpointData = Some(new LocalRDDCheckpointData(this)) } - checkpointData = Some(new LocalRDDCheckpointData(this)) this } /** - * Return whether this RDD is marked for checkpointing, either reliably or locally. + * Return whether this RDD is checkpointed and materialized, either reliably or locally. */ def isCheckpointed: Boolean = checkpointData.exists(_.isCheckpointed) /** + * Return whether this RDD is checkpointed and materialized, either reliably or locally. + * This is introduced as an alias for `isCheckpointed` to clarify the semantics of the + * return value. Exposed for testing. + */ + private[spark] def isCheckpointedAndMaterialized: Boolean = isCheckpointed + + /** * Return whether this RDD is marked for local checkpointing. * Exposed for testing. */ http://git-wip-us.apache.org/repos/asf/spark/blob/2195fec1/core/src/test/scala/org/apache/spark/CheckpointSuite.scala ---------------------------------------------------------------------- diff --git a/core/src/test/scala/org/apache/spark/CheckpointSuite.scala b/core/src/test/scala/org/apache/spark/CheckpointSuite.scala index d343bb9..608f7ca 100644 --- a/core/src/test/scala/org/apache/spark/CheckpointSuite.scala +++ b/core/src/test/scala/org/apache/spark/CheckpointSuite.scala @@ -241,9 +241,13 @@ class CheckpointSuite extends SparkFunSuite with LocalSparkContext with Logging val rdd = new BlockRDD[Int](sc, Array[BlockId]()) assert(rdd.partitions.size === 0) assert(rdd.isCheckpointed === false) + assert(rdd.isCheckpointedAndMaterialized === false) checkpoint(rdd, reliableCheckpoint) + assert(rdd.isCheckpointed === false) + assert(rdd.isCheckpointedAndMaterialized === false) assert(rdd.count() === 0) assert(rdd.isCheckpointed === true) + assert(rdd.isCheckpointedAndMaterialized === true) assert(rdd.partitions.size === 0) } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org