Hello, I am having an issue with Linear Regression when trying to fit training data to the model. The code below used to work, but it stopped recently. Spark is version 3.2.1.
# Split Data into train and test data train, test = data.randomSplit([0.9, 0.1]) y = 'Build_Rate' # Perform regression with train data assembler = VectorAssembler(inputCols=feature_cols, outputCol="Features") vtrain = assembler.transform(train).select('Features', y) lin_reg = LinearRegression(regParam = 0.0, elasticNetParam = 0.0, solver='normal', featuresCol = 'Features', labelCol = y) model = lin_reg.fit(vtrain) FAILS HERE org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 388.0 failed 4 times, most recent failure: Lost task 0.3 in stage 388.0 (TID 422) (10.139.64.4 executor 0): org.apache.spark.SparkUpgradeException: You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'MMM dd, yyyy hh:mm:ss aa' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html The full traceback is attached. The error is confusing me because there are no datetime columns in "train". "vtrain" is just "train" with the feature columns in dense vector form. [cid:image002.png@01D839F5.9EEB4860] Does anyone know how to fix this error? Thanks, Ken Bassett Data Scientist 1451 Marvin Griffin Rd. Augusta, GA 30906 (m) (706) 469-0696 kbass...@textron.com<mailto:kbass...@textron.com> [2019 E-mail Signature]
--------------------------------------------------------------------------- Py4JJavaError Traceback (most recent call last) <command-2567512786910096> in <module> ----> 1 model = lin_reg.fit(vtrain) /databricks/python_shell/dbruntime/MLWorkloadsInstrumentation/_pyspark.py in patched_method(self, *args, **kwargs) 28 call_succeeded = False 29 try: ---> 30 result = original_method(self, *args, **kwargs) 31 call_succeeded = True 32 return result /databricks/spark/python/pyspark/ml/base.py in fit(self, dataset, params) 159 return self.copy(params)._fit(dataset) 160 else: --> 161 return self._fit(dataset) 162 else: 163 raise TypeError("Params must be either a param map or a list/tuple of param maps, " /databricks/spark/python/pyspark/ml/wrapper.py in _fit(self, dataset) 333 334 def _fit(self, dataset): --> 335 java_model = self._fit_java(dataset) 336 model = self._create_model(java_model) 337 return self._copyValues(model) /databricks/spark/python/pyspark/ml/wrapper.py in _fit_java(self, dataset) 330 """ 331 self._transfer_params_to_java() --> 332 return self._java_obj.fit(dataset._jdf) 333 334 def _fit(self, dataset): /databricks/spark/python/lib/py4j-0.10.9.1-src.zip/py4j/java_gateway.py in __call__(self, *args) 1302 1303 answer = self.gateway_client.send_command(command) -> 1304 return_value = get_return_value( 1305 answer, self.gateway_client, self.target_id, self.name) 1306 /databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw) 115 def deco(*a, **kw): 116 try: --> 117 return f(*a, **kw) 118 except py4j.protocol.Py4JJavaError as e: 119 converted = convert_exception(e.java_exception) /databricks/spark/python/lib/py4j-0.10.9.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name) 324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client) 325 if answer[1] == REFERENCE_TYPE: --> 326 raise Py4JJavaError( 327 "An error occurred while calling {0}{1}{2}.\n". 328 format(target_id, ".", name), value) Py4JJavaError: An error occurred while calling o1033.fit. : org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 413.0 failed 4 times, most recent failure: Lost task 0.3 in stage 413.0 (TID 461) (10.139.64.4 executor 0): org.apache.spark.SparkUpgradeException: You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'MMM dd, yyyy hh:mm:ss aa' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html at org.apache.spark.sql.errors.QueryExecutionErrors$.failToRecognizePatternAfterUpgradeError(QueryExecutionErrors.scala:1054) at org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$$anonfun$checkLegacyFormatter$1.applyOrElse(DateTimeFormatterHelper.scala:187) at org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$$anonfun$checkLegacyFormatter$1.applyOrElse(DateTimeFormatterHelper.scala:180) at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38) at org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.validatePatternString(TimestampFormatter.scala:172) at org.apache.spark.sql.catalyst.util.TimestampFormatter$.getFormatter(TimestampFormatter.scala:405) at org.apache.spark.sql.catalyst.util.TimestampFormatter$.apply(TimestampFormatter.scala:466) at org.apache.spark.sql.catalyst.expressions.TimestampFormatterHelper.getFormatter(datetimeExpressions.scala:94) at org.apache.spark.sql.catalyst.expressions.TimestampFormatterHelper.getFormatter$(datetimeExpressions.scala:88) at org.apache.spark.sql.catalyst.expressions.ToTimestamp.getFormatter(datetimeExpressions.scala:1174) at org.apache.spark.sql.catalyst.expressions.TimestampFormatterHelper.$anonfun$formatterOption$1(datetimeExpressions.scala:85) at scala.Option.map(Option.scala:230) at org.apache.spark.sql.catalyst.expressions.TimestampFormatterHelper.formatterOption(datetimeExpressions.scala:85) at org.apache.spark.sql.catalyst.expressions.TimestampFormatterHelper.formatterOption$(datetimeExpressions.scala:83) at org.apache.spark.sql.catalyst.expressions.ToTimestamp.formatterOption$lzycompute(datetimeExpressions.scala:1174) at org.apache.spark.sql.catalyst.expressions.ToTimestamp.formatterOption(datetimeExpressions.scala:1174) at org.apache.spark.sql.catalyst.expressions.ToTimestamp.eval(datetimeExpressions.scala:1216) at org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:168) at org.apache.spark.sql.catalyst.expressions.InterpretedUnsafeProjection.apply(InterpretedUnsafeProjection.scala:90) at org.apache.spark.sql.execution.aggregate.AggregationIterator.$anonfun$generateResultProjection$5(AggregationIterator.scala:260) at org.apache.spark.sql.execution.aggregate.SortBasedAggregationIterator.next(SortBasedAggregationIterator.scala:154) at org.apache.spark.sql.execution.aggregate.SortBasedAggregationIterator.next(SortBasedAggregationIterator.scala:30) at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage18.hashAgg_doAggregateWithKeys_0$(Unknown Source) at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage18.processNext(Unknown Source) at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:759) at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460) at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:140) at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59) at org.apache.spark.scheduler.ShuffleMapTask.$anonfun$runTask$3(ShuffleMapTask.scala:81) at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110) at org.apache.spark.scheduler.ShuffleMapTask.$anonfun$runTask$1(ShuffleMapTask.scala:81) at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110) at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53) at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41) at org.apache.spark.scheduler.Task.doRunTask(Task.scala:153) at org.apache.spark.scheduler.Task.$anonfun$run$1(Task.scala:122) at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110) at org.apache.spark.scheduler.Task.run(Task.scala:93) at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$13(Executor.scala:824) at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1641) at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:827) at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:683) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748) Caused by: java.lang.IllegalArgumentException: Too many pattern letters: a at java.time.format.DateTimeFormatterBuilder.parseField(DateTimeFormatterBuilder.java:1774) at java.time.format.DateTimeFormatterBuilder.parsePattern(DateTimeFormatterBuilder.java:1613) at java.time.format.DateTimeFormatterBuilder.appendPattern(DateTimeFormatterBuilder.java:1581) at org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$.buildFormatter(DateTimeFormatterHelper.scala:250) at org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper.getOrCreateFormatter(DateTimeFormatterHelper.scala:126) at org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper.getOrCreateFormatter$(DateTimeFormatterHelper.scala:117) at org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.getOrCreateFormatter(TimestampFormatter.scala:108) at org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.formatter$lzycompute(TimestampFormatter.scala:117) at org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.formatter(TimestampFormatter.scala:116) at org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.validatePatternString(TimestampFormatter.scala:171) ... 43 more Driver stacktrace: at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2979) at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2926) at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2920) at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62) at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55) at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49) at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2920) at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1340) at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1340) at scala.Option.foreach(Option.scala:407) at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1340) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3188) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3129) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3117) at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49) Caused by: org.apache.spark.SparkUpgradeException: You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'MMM dd, yyyy hh:mm:ss aa' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html at org.apache.spark.sql.errors.QueryExecutionErrors$.failToRecognizePatternAfterUpgradeError(QueryExecutionErrors.scala:1054) at org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$$anonfun$checkLegacyFormatter$1.applyOrElse(DateTimeFormatterHelper.scala:187) at org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$$anonfun$checkLegacyFormatter$1.applyOrElse(DateTimeFormatterHelper.scala:180) at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38) at org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.validatePatternString(TimestampFormatter.scala:172) at org.apache.spark.sql.catalyst.util.TimestampFormatter$.getFormatter(TimestampFormatter.scala:405) at org.apache.spark.sql.catalyst.util.TimestampFormatter$.apply(TimestampFormatter.scala:466) at org.apache.spark.sql.catalyst.expressions.TimestampFormatterHelper.getFormatter(datetimeExpressions.scala:94) at org.apache.spark.sql.catalyst.expressions.TimestampFormatterHelper.getFormatter$(datetimeExpressions.scala:88) at org.apache.spark.sql.catalyst.expressions.ToTimestamp.getFormatter(datetimeExpressions.scala:1174) at org.apache.spark.sql.catalyst.expressions.TimestampFormatterHelper.$anonfun$formatterOption$1(datetimeExpressions.scala:85) at scala.Option.map(Option.scala:230) at org.apache.spark.sql.catalyst.expressions.TimestampFormatterHelper.formatterOption(datetimeExpressions.scala:85) at org.apache.spark.sql.catalyst.expressions.TimestampFormatterHelper.formatterOption$(datetimeExpressions.scala:83) at org.apache.spark.sql.catalyst.expressions.ToTimestamp.formatterOption$lzycompute(datetimeExpressions.scala:1174) at org.apache.spark.sql.catalyst.expressions.ToTimestamp.formatterOption(datetimeExpressions.scala:1174) at org.apache.spark.sql.catalyst.expressions.ToTimestamp.eval(datetimeExpressions.scala:1216) at org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:168) at org.apache.spark.sql.catalyst.expressions.InterpretedUnsafeProjection.apply(InterpretedUnsafeProjection.scala:90) at org.apache.spark.sql.execution.aggregate.AggregationIterator.$anonfun$generateResultProjection$5(AggregationIterator.scala:260) at org.apache.spark.sql.execution.aggregate.SortBasedAggregationIterator.next(SortBasedAggregationIterator.scala:154) at org.apache.spark.sql.execution.aggregate.SortBasedAggregationIterator.next(SortBasedAggregationIterator.scala:30) at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage18.hashAgg_doAggregateWithKeys_0$(Unknown Source) at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage18.processNext(Unknown Source) at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:759) at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460) at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:140) at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59) at org.apache.spark.scheduler.ShuffleMapTask.$anonfun$runTask$3(ShuffleMapTask.scala:81) at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110) at org.apache.spark.scheduler.ShuffleMapTask.$anonfun$runTask$1(ShuffleMapTask.scala:81) at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110) at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53) at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41) at org.apache.spark.scheduler.Task.doRunTask(Task.scala:153) at org.apache.spark.scheduler.Task.$anonfun$run$1(Task.scala:122) at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110) at org.apache.spark.scheduler.Task.run(Task.scala:93) at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$13(Executor.scala:824) at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1641) at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:827) at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:683) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748) Caused by: java.lang.IllegalArgumentException: Too many pattern letters: a at java.time.format.DateTimeFormatterBuilder.parseField(DateTimeFormatterBuilder.java:1774) at java.time.format.DateTimeFormatterBuilder.parsePattern(DateTimeFormatterBuilder.java:1613) at java.time.format.DateTimeFormatterBuilder.appendPattern(DateTimeFormatterBuilder.java:1581) at org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$.buildFormatter(DateTimeFormatterHelper.scala:250) at org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper.getOrCreateFormatter(DateTimeFormatterHelper.scala:126) at org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper.getOrCreateFormatter$(DateTimeFormatterHelper.scala:117) at org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.getOrCreateFormatter(TimestampFormatter.scala:108) at org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.formatter$lzycompute(TimestampFormatter.scala:117) at org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.formatter(TimestampFormatter.scala:116) at org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.validatePatternString(TimestampFormatter.scala:171) ... 43 more
--------------------------------------------------------------------- To unsubscribe e-mail: user-unsubscr...@spark.apache.org