Github user smurching commented on a diff in the pull request: https://github.com/apache/spark/pull/19186#discussion_r138712707 --- Diff: mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala --- @@ -483,24 +488,24 @@ class LogisticRegression @Since("1.2.0") ( this } - override protected[spark] def train(dataset: Dataset[_]): LogisticRegressionModel = { - val handlePersistence = dataset.storageLevel == StorageLevel.NONE - train(dataset, handlePersistence) - } - - protected[spark] def train( - dataset: Dataset[_], - handlePersistence: Boolean): LogisticRegressionModel = { + protected[spark] def train(dataset: Dataset[_]): LogisticRegressionModel = { val w = if (!isDefined(weightCol) || $(weightCol).isEmpty) lit(1.0) else col($(weightCol)) val instances: RDD[Instance] = dataset.select(col($(labelCol)), w, col($(featuresCol))).rdd.map { case Row(label: Double, weight: Double, features: Vector) => Instance(label, weight, features) } - if (handlePersistence) instances.persist(StorageLevel.MEMORY_AND_DISK) + if (dataset.storageLevel == StorageLevel.NONE) { + if ($(handlePersistence)) { + instances.persist(StorageLevel.MEMORY_AND_DISK) + } else { + logWarning("The input dataset is uncached, which may hurt performance if its upstreams " + + "are also uncached.") + } + } --- End diff -- Oops, yeah I had forgotten about that (thanks for the catch). One solution could be to extend `HasHandlePersistence` in `Predictor` and check `handlePersistence` / cache uncached data in `Predictor.fit()` instead of `Predictor.train()`. This has the drawback of limiting individual algorithms' ability to customize their caching behavior.
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org For additional commands, e-mail: reviews-help@spark.apache.org