[jira] [Updated] (SPARK-34529) spark.read.csv throws exception "'lineSep' can contain only 1 character" when parsing Windows line feed (CR LF)

[ https://issues.apache.org/jira/browse/SPARK-34529?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Syedhamjath updated SPARK-34529:
    Attachment: image-2021-08-26-14-12-30-397.png

> spark.read.csv throws exception "'lineSep' can contain only 1 character" when parsing Windows line feed (CR LF)
>
>                Key: SPARK-34529
>                URL: https://issues.apache.org/jira/browse/SPARK-34529
>            Project: Spark
>         Issue Type: Improvement
>         Components: PySpark, SQL
>   Affects Versions: 3.0.3, 3.1.1, 3.2.0
>           Reporter: Shanmugavel Kuttiyandi Chandrakasu
>           Priority: Minor
>        Attachments: TestData.csv, image-2021-08-26-14-04-47-464.png, image-2021-08-26-14-06-41-055.png, image-2021-08-26-14-12-30-397.png
>
> The lineSep documentation says:
>
> `lineSep` (default covers all `\r`, `\r\n` and `\n`): defines the line separator that should be used for parsing. Maximum length is 1 character.
>
> Reference: [https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrameReader]
>
> When reading a CSV file with Spark:
>
> src_df = (spark.read
>     .option("header", "true")
>     .option("multiLine", "true")
>     .option("escape", "ǁ")
>     .option("lineSep", "\r\n")
>     .schema(materialusetype_Schema)
>     .option("badRecordsPath", "/fh_badfile")
>     .csv("/crlf.csv")
> )
>
> Below is the stack trace:
>
> java.lang.IllegalArgumentException: requirement failed: 'lineSep' can contain only 1 character.
>     at scala.Predef$.require(Predef.scala:281)
>     at org.apache.spark.sql.catalyst.csv.CSVOptions.$anonfun$lineSeparator$1(CSVOptions.scala:209)
>     at scala.Option.map(Option.scala:230)
>     at org.apache.spark.sql.catalyst.csv.CSVOptions.<init>(CSVOptions.scala:207)
>     at org.apache.spark.sql.catalyst.csv.CSVOptions.<init>(CSVOptions.scala:58)
>     at org.apache.spark.sql.execution.datasources.csv.CSVFileFormat.buildReader(CSVFileFormat.scala:108)
>     at org.apache.spark.sql.execution.datasources.FileFormat.buildReaderWithPartitionValues(FileFormat.scala:132)
>     at org.apache.spark.sql.execution.datasources.FileFormat.buildReaderWithPartitionValues$(FileFormat.scala:123)
>     at org.apache.spark.sql.execution.datasources.TextBasedFileFormat.buildReaderWithPartitionValues(FileFormat.scala:162)
>     at org.apache.spark.sql.execution.FileSourceScanExec.inputRDD$lzycompute(DataSourceScanExec.scala:510)
>     at org.apache.spark.sql.execution.FileSourceScanExec.inputRDD(DataSourceScanExec.scala:497)
>     at org.apache.spark.sql.execution.FileSourceScanExec.doExecute(DataSourceScanExec.scala:692)
>     at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:196)
>     at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:240)
>     at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:165)
>     at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:236)
>     at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:192)
>     at org.apache.spark.sql.execution.collect.Collector$.collect(Collector.scala:79)
>     at org.apache.spark.sql.execution.collect.Collector$.collect(Collector.scala:88)
>     at org.apache.spark.sql.execution.collect.InternalRowFormat$.collect(cachedSparkResults.scala:61)
>     at org.apache.spark.sql.execution.collect.InternalRowFormat$.collect(cachedSparkResults.scala:57)
>     at org.apache.spark.sql.execution.ResultCacheManager.$anonfun$getOrComputeResultInternal$1(ResultCacheManager.scala:483)
>     at scala.Option.getOrElse(Option.scala:189)
>     at org.apache.spark.sql.execution.ResultCacheManager.getOrComputeResultInternal(ResultCacheManager.scala:483)
>     at org.apache.spark.sql.execution.ResultCacheManager.getOrComputeResult(ResultCacheManager.scala:427)
>     at org.apache.spark.sql.execution.CollectLimitExec.executeCollectResult(limit.scala:58)
>     at org.apache.spark.sql.Dataset.collectResult(Dataset.scala:3013)
>     at org.apache.spark.sql.Dataset.$anonfun$collectResult$1(Dataset.scala:3004)
>     at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3728)
>     at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$5(SQLExecution.scala:116)
>     at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:248)
>     at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$1(SQLExecution.scala:101)
>     at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:841)
>     at org.apache.spark.sql.execution.SQLExecution$.withCustomExecutionEnv(SQLExecution.scala:77)
>     at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:198)
>     at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3726)
>     at org.apache.spark.sql.Dataset.collectResult(Dataset.scala:3003)
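For context, below is a minimal, self-contained PySpark sketch of the problem. The file path, schema, sample data, and application name are hypothetical placeholders (they are not taken from the attached TestData.csv or the reporter's materialusetype_Schema). It reproduces the reported IllegalArgumentException and then shows the workaround implied by the documentation quoted above: leave lineSep unset so the default line-separator handling, which already covers \r\n, is used.

    import os
    import tempfile

    from pyspark.sql import SparkSession
    from pyspark.sql.types import StringType, StructField, StructType

    spark = SparkSession.builder.appName("crlf-csv-repro").getOrCreate()

    # Write a tiny CSV with Windows (CR LF) line endings so the sketch is self-contained.
    path = os.path.join(tempfile.mkdtemp(), "crlf.csv")
    with open(path, "w", newline="") as f:
        f.write("id,description\r\n1,first row\r\n2,second row\r\n")

    # Hypothetical two-column schema standing in for materialusetype_Schema in the report.
    schema = StructType([
        StructField("id", StringType(), True),
        StructField("description", StringType(), True),
    ])

    # As reported: lineSep="\r\n" is rejected because CSVOptions only accepts a
    # single-character line separator, so the action below fails with
    # java.lang.IllegalArgumentException: requirement failed: 'lineSep' can contain only 1 character.
    try:
        (spark.read
            .option("header", "true")
            .option("multiLine", "true")
            .option("lineSep", "\r\n")
            .schema(schema)
            .csv(path)
            .show())
    except Exception as err:
        print("Expected failure:", err)

    # Workaround suggested by the documentation quoted above: leave lineSep unset so the
    # default separator handling (which covers \r, \r\n and \n) parses the CRLF records.
    (spark.read
        .option("header", "true")
        .option("multiLine", "true")
        .schema(schema)
        .csv(path)
        .show())

    spark.stop()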
[jira] [Updated] (SPARK-34529) spark.read.csv throws exception "'lineSep' can contain only 1 character" when parsing Windows line feed (CR LF)
Syedhamjath updated SPARK-34529:
    Attachment: TestData.csv
[jira] [Updated] (SPARK-34529) spark.read.csv throws exception "'lineSep' can contain only 1 character" when parsing Windows line feed (CR LF)
Syedhamjath updated SPARK-34529:
    Attachment: image-2021-08-26-14-06-41-055.png
[jira] [Updated] (SPARK-34529) spark.read.csv throws exception "'lineSep' can contain only 1 character" when parsing Windows line feed (CR LF)
Syedhamjath updated SPARK-34529:
    Attachment: image-2021-08-26-14-04-47-464.png
[jira] [Updated] (SPARK-34529) spark.read.csv throws exception "'lineSep' can contain only 1 character" when parsing Windows line feed (CR LF)
Takeshi Yamamuro updated SPARK-34529:
    Component/s: SQL (was: Spark Core)
[jira] [Updated] (SPARK-34529) spark.read.csv throws exception "'lineSep' can contain only 1 character" when parsing Windows line feed (CR LF)
Takeshi Yamamuro updated SPARK-34529:
    Affects Version/s: 3.0.3, 3.1.1, 3.2.0 (was: 3.0.1)
[jira] [Updated] (SPARK-34529) spark.read.csv throws exception "'lineSep' can contain only 1 character" when parsing Windows line feed (CR LF)
Takeshi Yamamuro updated SPARK-34529:
    Issue Type: Improvement (was: Bug)