Yeah, I realized shortly after I sent that message that my use of map in
that code was problematic. This is probably a bit better:
def split[T : ClassManifest](data: RDD[T], p: Double, seed: Long =
System.currentTimeMillis): (RDD[T], RDD[T]) = {
val rand = new java.util.Random(seed)
val partitionSeeds = data.partitions.map(partition => rand.nextLong)
val temp = data.mapPartitionsWithIndex((index, iter) => {
val partitionRand = new java.util.Random(partitionSeeds(index))
iter.map(x => (x, partitionRand.nextDouble))
})
(temp.filter(_._2 <= p).map(_._1), temp.filter(_._2 > p).map(_._1))
}