[jira] [Commented] (SPARK-43389) spark.read.csv throws NullPointerException when lineSep is set to None
[ https://issues.apache.org/jira/browse/SPARK-43389?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17771698#comment-17771698 ] Gera Shegalov commented on SPARK-43389: --- There is a symmetrical issue on the DataFrameWriter side: {code:python} >>> spark.createDataFrame([('some value',),]).write.option('someOpt', >>> None).saveAsTable("hive_csv_t21") {code} {code:java} 23/10/03 21:39:12 WARN HiveExternalCatalog: Could not persist `spark_catalog`.`default`.`hive_csv_t21` in a Hive compatible way. Persisting it into Hive metastore in Spark SQL specific format. org.apache.hadoop.hive.ql.metadata.HiveException: MetaException(message:java.lang.NullPointerException: Null values not allowed in persistent maps.) at org.apache.hadoop.hive.ql.metadata.Hive.createTable(Hive.java:869) at org.apache.hadoop.hive.ql.metadata.Hive.createTable(Hive.java:874) at org.apache.spark.sql.hive.client.Shim_v0_12.createTable(HiveShim.scala:614) at org.apache.spark.sql.hive.client.HiveClientImpl.$anonfun$createTable$1(HiveClientImpl.scala:573) at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) at org.apache.spark.sql.hive.client.HiveClientImpl.$anonfun$withHiveState$1(HiveClientImpl.scala:303) at org.apache.spark.sql.hive.client.HiveClientImpl.liftedTree1$1(HiveClientImpl.scala:234) at org.apache.spark.sql.hive.client.HiveClientImpl.retryLocked(HiveClientImpl.scala:233) at org.apache.spark.sql.hive.client.HiveClientImpl.withHiveState(HiveClientImpl.scala:283) at org.apache.spark.sql.hive.client.HiveClientImpl.createTable(HiveClientImpl.scala:571) at org.apache.spark.sql.hive.HiveExternalCatalog.saveTableIntoHive(HiveExternalCatalog.scala:526) at org.apache.spark.sql.hive.HiveExternalCatalog.createDataSourceTable(HiveExternalCatalog.scala:415) at org.apache.spark.sql.hive.HiveExternalCatalog.$anonfun$createTable$1(HiveExternalCatalog.scala:274) at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) at 
org.apache.spark.sql.hive.HiveExternalCatalog.withClient(HiveExternalCatalog.scala:99) at org.apache.spark.sql.hive.HiveExternalCatalog.createTable(HiveExternalCatalog.scala:245) at org.apache.spark.sql.catalyst.catalog.ExternalCatalogWithListener.createTable(ExternalCatalogWithListener.scala:94) at org.apache.spark.sql.catalyst.catalog.SessionCatalog.createTable(SessionCatalog.scala:402) at org.apache.spark.sql.rapids.shims.GpuCreateDataSourceTableAsSelectCommand.run(GpuCreateDataSourceTableAsSelectCommandShims.scala:91) at com.nvidia.spark.rapids.GpuExecutedCommandExec.sideEffectResult$lzycompute(GpuExecutedCommandExec.scala:52) at com.nvidia.spark.rapids.GpuExecutedCommandExec.sideEffectResult(GpuExecutedCommandExec.scala:50) at com.nvidia.spark.rapids.GpuExecutedCommandExec.executeCollect(GpuExecutedCommandExec.scala:61) at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107) at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125) at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201) at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108) at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900) at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66) at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107) at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98) at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461) at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76) at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461) at 
org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32) at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267) at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263) at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32) at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32) at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437) at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98) at
[jira] [Commented] (SPARK-43389) spark.read.csv throws NullPointerException when lineSep is set to None
[ https://issues.apache.org/jira/browse/SPARK-43389?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17741360#comment-17741360 ] ASF GitHub Bot commented on SPARK-43389: User 'gdhuper' has created a pull request for this issue: https://github.com/apache/spark/pull/41904 > spark.read.csv throws NullPointerException when lineSep is set to None > -- > > Key: SPARK-43389 > URL: https://issues.apache.org/jira/browse/SPARK-43389 > Project: Spark > Issue Type: Improvement > Components: PySpark, SQL >Affects Versions: 3.3.1 >Reporter: Zach Liu >Priority: Trivial > > lineSep was defined as Optional[str] yet i'm unable to explicitly set it as > None: > reader = spark.read.format("csv") > read_options={'inferSchema': False, 'header': True, 'mode': 'DROPMALFORMED', > 'sep': '\t', 'escape': '\\', 'multiLine': False, 'lineSep': None} > for option, option_value in read_options.items(): > reader = reader.option(option, option_value) > df = reader.load("s3://") > raises exception: > py4j.protocol.Py4JJavaError: An error occurred while calling o126.load. 
> : java.lang.NullPointerException > at > scala.collection.immutable.StringOps$.length$extension(StringOps.scala:51) > at scala.collection.immutable.StringOps.length(StringOps.scala:51) > at > scala.collection.IndexedSeqOptimized.isEmpty(IndexedSeqOptimized.scala:30) > at > scala.collection.IndexedSeqOptimized.isEmpty$(IndexedSeqOptimized.scala:30) > at scala.collection.immutable.StringOps.isEmpty(StringOps.scala:33) > at scala.collection.TraversableOnce.nonEmpty(TraversableOnce.scala:143) > at scala.collection.TraversableOnce.nonEmpty$(TraversableOnce.scala:143) > at scala.collection.immutable.StringOps.nonEmpty(StringOps.scala:33) > at > org.apache.spark.sql.catalyst.csv.CSVOptions.$anonfun$lineSeparator$1(CSVOptions.scala:216) > at scala.Option.map(Option.scala:230) > at > org.apache.spark.sql.catalyst.csv.CSVOptions.(CSVOptions.scala:215) > at > org.apache.spark.sql.catalyst.csv.CSVOptions.(CSVOptions.scala:47) > at > org.apache.spark.sql.execution.datasources.csv.CSVFileFormat.inferSchema(CSVFileFormat.scala:60) > at > org.apache.spark.sql.execution.datasources.DataSource.$anonfun$getOrInferFileFormatSchema$11(DataSource.scala:210) > at scala.Option.orElse(Option.scala:447) > at > org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:207) > at > org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:411) > at > org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:228) > at > org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:210) > at scala.Option.getOrElse(Option.scala:189) > at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:210) > at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:185) > at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) > at > sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) > at > 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:498) > at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) > at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) > at py4j.Gateway.invoke(Gateway.java:282) > at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) > at py4j.commands.CallCommand.execute(CallCommand.java:79) > at > py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182) > at py4j.ClientServerConnection.run(ClientServerConnection.java:106) > at java.lang.Thread.run(Thread.java:750) -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Commented] (SPARK-43389) spark.read.csv throws NullPointerException when lineSep is set to None
[ https://issues.apache.org/jira/browse/SPARK-43389?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17724299#comment-17724299 ] Sean R. Owen commented on SPARK-43389: -- Looks like we should handle the case where it's explicitly set to null in CSVOptions:256. Just use a match statement to handle null and None the same way - return None. Are you up for making a PR? > spark.read.csv throws NullPointerException when lineSep is set to None > -- > > Key: SPARK-43389 > URL: https://issues.apache.org/jira/browse/SPARK-43389 > Project: Spark > Issue Type: Improvement > Components: PySpark, SQL >Affects Versions: 3.3.1 >Reporter: Zach Liu >Priority: Trivial > > lineSep was defined as Optional[str] yet i'm unable to explicitly set it as > None: > reader = spark.read.format("csv") > read_options={'inferSchema': False, 'header': True, 'mode': 'DROPMALFORMED', > 'sep': '\t', 'escape': '\\', 'multiLine': False, 'lineSep': None} > for option, option_value in read_options.items(): > reader = reader.option(option, option_value) > df = reader.load("s3://") > raises exception: > py4j.protocol.Py4JJavaError: An error occurred while calling o126.load. 
> : java.lang.NullPointerException > at > scala.collection.immutable.StringOps$.length$extension(StringOps.scala:51) > at scala.collection.immutable.StringOps.length(StringOps.scala:51) > at > scala.collection.IndexedSeqOptimized.isEmpty(IndexedSeqOptimized.scala:30) > at > scala.collection.IndexedSeqOptimized.isEmpty$(IndexedSeqOptimized.scala:30) > at scala.collection.immutable.StringOps.isEmpty(StringOps.scala:33) > at scala.collection.TraversableOnce.nonEmpty(TraversableOnce.scala:143) > at scala.collection.TraversableOnce.nonEmpty$(TraversableOnce.scala:143) > at scala.collection.immutable.StringOps.nonEmpty(StringOps.scala:33) > at > org.apache.spark.sql.catalyst.csv.CSVOptions.$anonfun$lineSeparator$1(CSVOptions.scala:216) > at scala.Option.map(Option.scala:230) > at > org.apache.spark.sql.catalyst.csv.CSVOptions.(CSVOptions.scala:215) > at > org.apache.spark.sql.catalyst.csv.CSVOptions.(CSVOptions.scala:47) > at > org.apache.spark.sql.execution.datasources.csv.CSVFileFormat.inferSchema(CSVFileFormat.scala:60) > at > org.apache.spark.sql.execution.datasources.DataSource.$anonfun$getOrInferFileFormatSchema$11(DataSource.scala:210) > at scala.Option.orElse(Option.scala:447) > at > org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:207) > at > org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:411) > at > org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:228) > at > org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:210) > at scala.Option.getOrElse(Option.scala:189) > at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:210) > at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:185) > at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) > at > sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) > at > 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:498) > at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) > at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) > at py4j.Gateway.invoke(Gateway.java:282) > at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) > at py4j.commands.CallCommand.execute(CallCommand.java:79) > at > py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182) > at py4j.ClientServerConnection.run(ClientServerConnection.java:106) > at java.lang.Thread.run(Thread.java:750) -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Commented] (SPARK-43389) spark.read.csv throws NullPointerException when lineSep is set to None
[ https://issues.apache.org/jira/browse/SPARK-43389?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17720568#comment-17720568 ] Zach Liu commented on SPARK-43389: -- That's why I set the type as "Improvement" and the priority as "Trivial". But still, it's an unnecessary confusion. > spark.read.csv throws NullPointerException when lineSep is set to None > -- > > Key: SPARK-43389 > URL: https://issues.apache.org/jira/browse/SPARK-43389 > Project: Spark > Issue Type: Improvement > Components: PySpark, SQL >Affects Versions: 3.3.1 >Reporter: Zach Liu >Priority: Trivial > > lineSep was defined as Optional[str] yet i'm unable to explicitly set it as > None: > reader = spark.read.format("csv") > read_options={'inferSchema': False, 'header': True, 'mode': 'DROPMALFORMED', > 'sep': '\t', 'escape': '\\', 'multiLine': False, 'lineSep': None} > for option, option_value in read_options.items(): > reader = reader.option(option, option_value) > df = reader.load("s3://") > raises exception: > py4j.protocol.Py4JJavaError: An error occurred while calling o126.load. 
> : java.lang.NullPointerException > at > scala.collection.immutable.StringOps$.length$extension(StringOps.scala:51) > at scala.collection.immutable.StringOps.length(StringOps.scala:51) > at > scala.collection.IndexedSeqOptimized.isEmpty(IndexedSeqOptimized.scala:30) > at > scala.collection.IndexedSeqOptimized.isEmpty$(IndexedSeqOptimized.scala:30) > at scala.collection.immutable.StringOps.isEmpty(StringOps.scala:33) > at scala.collection.TraversableOnce.nonEmpty(TraversableOnce.scala:143) > at scala.collection.TraversableOnce.nonEmpty$(TraversableOnce.scala:143) > at scala.collection.immutable.StringOps.nonEmpty(StringOps.scala:33) > at > org.apache.spark.sql.catalyst.csv.CSVOptions.$anonfun$lineSeparator$1(CSVOptions.scala:216) > at scala.Option.map(Option.scala:230) > at > org.apache.spark.sql.catalyst.csv.CSVOptions.(CSVOptions.scala:215) > at > org.apache.spark.sql.catalyst.csv.CSVOptions.(CSVOptions.scala:47) > at > org.apache.spark.sql.execution.datasources.csv.CSVFileFormat.inferSchema(CSVFileFormat.scala:60) > at > org.apache.spark.sql.execution.datasources.DataSource.$anonfun$getOrInferFileFormatSchema$11(DataSource.scala:210) > at scala.Option.orElse(Option.scala:447) > at > org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:207) > at > org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:411) > at > org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:228) > at > org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:210) > at scala.Option.getOrElse(Option.scala:189) > at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:210) > at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:185) > at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) > at > sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) > at > 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:498) > at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) > at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) > at py4j.Gateway.invoke(Gateway.java:282) > at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) > at py4j.commands.CallCommand.execute(CallCommand.java:79) > at > py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182) > at py4j.ClientServerConnection.run(ClientServerConnection.java:106) > at java.lang.Thread.run(Thread.java:750) -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Commented] (SPARK-43389) spark.read.csv throws NullPointerException when lineSep is set to None
[ https://issues.apache.org/jira/browse/SPARK-43389?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17720371#comment-17720371 ] Hyukjin Kwon commented on SPARK-43389: -- [~zach liu] you can simply not specify it instead of setting it to None. > spark.read.csv throws NullPointerException when lineSep is set to None > -- > > Key: SPARK-43389 > URL: https://issues.apache.org/jira/browse/SPARK-43389 > Project: Spark > Issue Type: Improvement > Components: PySpark, SQL >Affects Versions: 3.3.1 >Reporter: Zach Liu >Priority: Trivial > > lineSep was defined as Optional[str] yet i'm unable to explicitly set it as > None: > reader = spark.read.format("csv") > read_options={'inferSchema': False, 'header': True, 'mode': 'DROPMALFORMED', > 'sep': '\t', 'escape': '\\', 'multiLine': False, 'lineSep': None} > for option, option_value in read_options.items(): > reader = reader.option(option, option_value) > df = reader.load("s3://") > raises exception: > py4j.protocol.Py4JJavaError: An error occurred while calling o126.load. 
> : java.lang.NullPointerException > at > scala.collection.immutable.StringOps$.length$extension(StringOps.scala:51) > at scala.collection.immutable.StringOps.length(StringOps.scala:51) > at > scala.collection.IndexedSeqOptimized.isEmpty(IndexedSeqOptimized.scala:30) > at > scala.collection.IndexedSeqOptimized.isEmpty$(IndexedSeqOptimized.scala:30) > at scala.collection.immutable.StringOps.isEmpty(StringOps.scala:33) > at scala.collection.TraversableOnce.nonEmpty(TraversableOnce.scala:143) > at scala.collection.TraversableOnce.nonEmpty$(TraversableOnce.scala:143) > at scala.collection.immutable.StringOps.nonEmpty(StringOps.scala:33) > at > org.apache.spark.sql.catalyst.csv.CSVOptions.$anonfun$lineSeparator$1(CSVOptions.scala:216) > at scala.Option.map(Option.scala:230) > at > org.apache.spark.sql.catalyst.csv.CSVOptions.(CSVOptions.scala:215) > at > org.apache.spark.sql.catalyst.csv.CSVOptions.(CSVOptions.scala:47) > at > org.apache.spark.sql.execution.datasources.csv.CSVFileFormat.inferSchema(CSVFileFormat.scala:60) > at > org.apache.spark.sql.execution.datasources.DataSource.$anonfun$getOrInferFileFormatSchema$11(DataSource.scala:210) > at scala.Option.orElse(Option.scala:447) > at > org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:207) > at > org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:411) > at > org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:228) > at > org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:210) > at scala.Option.getOrElse(Option.scala:189) > at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:210) > at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:185) > at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) > at > sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) > at > 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:498) > at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) > at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) > at py4j.Gateway.invoke(Gateway.java:282) > at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) > at py4j.commands.CallCommand.execute(CallCommand.java:79) > at > py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182) > at py4j.ClientServerConnection.run(ClientServerConnection.java:106) > at java.lang.Thread.run(Thread.java:750) -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org