[ https://issues.apache.org/jira/browse/SPARK-23814?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
bharath kumar avusherla updated SPARK-23814: -------------------------------------------- Description: When a file name contains a colon and the data contains a newline character, reading with spark.read.option("multiLine","true").csv("s3n://DirectoryPath/") throws a *"java.lang.IllegalArgumentException: java.net.URISyntaxException: Relative path in absolute URI: 2017-08-01T00:00:00Z.csv.gz"* error. If we remove option("multiLine","true"), it works fine even though the file name has a colon in it. It also works fine if I apply *option("multiLine","true")* to any other file that doesn't have a colon in its name. But when both are present (a colon in the file name and a newline in the data), it does not work. {quote}java.lang.IllegalArgumentException: java.net.URISyntaxException: Relative path in absolute URI: 2017-08-01T00:00:00Z.csv.gz at org.apache.hadoop.fs.Path.initialize(Path.java:205) at org.apache.hadoop.fs.Path.<init>(Path.java:171) at org.apache.hadoop.fs.Path.<init>(Path.java:93) at org.apache.hadoop.fs.Globber.glob(Globber.java:253) at org.apache.hadoop.fs.FileSystem.globStatus(FileSystem.java:1676) at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:294) at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.listStatus(FileInputFormat.java:265) at org.apache.spark.input.StreamFileInputFormat.setMinPartitions(PortableDataStream.scala:51) at org.apache.spark.rdd.BinaryFileRDD.getPartitions(BinaryFileRDD.scala:46) at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252) at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250) at scala.Option.getOrElse(Option.scala:121) at org.apache.spark.rdd.RDD.partitions(RDD.scala:250) at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35) at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252) at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250) at 
scala.Option.getOrElse(Option.scala:121) at org.apache.spark.rdd.RDD.partitions(RDD.scala:250) at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35) at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252) at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250) at scala.Option.getOrElse(Option.scala:121) at org.apache.spark.rdd.RDD.partitions(RDD.scala:250) at org.apache.spark.rdd.RDD$$anonfun$take$1.apply(RDD.scala:1333) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) at org.apache.spark.rdd.RDD.withScope(RDD.scala:362) at org.apache.spark.rdd.RDD.take(RDD.scala:1327) at org.apache.spark.sql.execution.datasources.csv.MultiLineCSVDataSource$.infer(CSVDataSource.scala:224) at org.apache.spark.sql.execution.datasources.csv.CSVDataSource.inferSchema(CSVDataSource.scala:62) at org.apache.spark.sql.execution.datasources.csv.CSVFileFormat.inferSchema(CSVFileFormat.scala:57) at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$7.apply(DataSource.scala:177) at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$7.apply(DataSource.scala:177) at scala.Option.orElse(Option.scala:289) at org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:176) at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:366) at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178) at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:533) at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:412) ... 48 elided Caused by: java.net.URISyntaxException: Relative path in absolute URI: 2017-08-01T00:00:00Z.csv.gz at java.net.URI.checkPath(URI.java:1823) at java.net.URI.<init>(URI.java:745) at org.apache.hadoop.fs.Path.initialize(Path.java:202) ... 
86 more {quote} was: When a file name contains a colon and the data contains a newline character, reading with spark.read.option("multiLine","true").csv("s3n://Directory/") throws a *"java.lang.IllegalArgumentException: java.net.URISyntaxException: Relative path in absolute URI: 2017-08-01T00:00:00Z.csv.gz"* error. If we remove option("multiLine","true"), it works fine even though the file name has a colon in it. It also works fine if I apply *option("multiLine","true")* to any other file that doesn't have a colon in its name. But when both are present (a colon in the file name and a newline in the data), it does not work. {quote}java.lang.IllegalArgumentException: java.net.URISyntaxException: Relative path in absolute URI: 2017-08-01T00:00:00Z.csv.gz at org.apache.hadoop.fs.Path.initialize(Path.java:205) at org.apache.hadoop.fs.Path.<init>(Path.java:171) at org.apache.hadoop.fs.Path.<init>(Path.java:93) at org.apache.hadoop.fs.Globber.glob(Globber.java:253) at org.apache.hadoop.fs.FileSystem.globStatus(FileSystem.java:1676) at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:294) at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.listStatus(FileInputFormat.java:265) at org.apache.spark.input.StreamFileInputFormat.setMinPartitions(PortableDataStream.scala:51) at org.apache.spark.rdd.BinaryFileRDD.getPartitions(BinaryFileRDD.scala:46) at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252) at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250) at scala.Option.getOrElse(Option.scala:121) at org.apache.spark.rdd.RDD.partitions(RDD.scala:250) at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35) at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252) at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250) at scala.Option.getOrElse(Option.scala:121) at org.apache.spark.rdd.RDD.partitions(RDD.scala:250) at 
org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35) at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252) at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250) at scala.Option.getOrElse(Option.scala:121) at org.apache.spark.rdd.RDD.partitions(RDD.scala:250) at org.apache.spark.rdd.RDD$$anonfun$take$1.apply(RDD.scala:1333) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) at org.apache.spark.rdd.RDD.withScope(RDD.scala:362) at org.apache.spark.rdd.RDD.take(RDD.scala:1327) at org.apache.spark.sql.execution.datasources.csv.MultiLineCSVDataSource$.infer(CSVDataSource.scala:224) at org.apache.spark.sql.execution.datasources.csv.CSVDataSource.inferSchema(CSVDataSource.scala:62) at org.apache.spark.sql.execution.datasources.csv.CSVFileFormat.inferSchema(CSVFileFormat.scala:57) at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$7.apply(DataSource.scala:177) at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$7.apply(DataSource.scala:177) at scala.Option.orElse(Option.scala:289) at org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:176) at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:366) at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178) at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:533) at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:412) ... 48 elided Caused by: java.net.URISyntaxException: Relative path in absolute URI: 2017-08-01T00:00:00Z.csv.gz at java.net.URI.checkPath(URI.java:1823) at java.net.URI.<init>(URI.java:745) at org.apache.hadoop.fs.Path.initialize(Path.java:202) ... 86 more {quote} > Couldn't read file with colon in name and new line character in one of the > field. 
> --------------------------------------------------------------------------------- > > Key: SPARK-23814 > URL: https://issues.apache.org/jira/browse/SPARK-23814 > Project: Spark > Issue Type: Bug > Components: Spark Core, Spark Shell > Affects Versions: 2.2.0 > Reporter: bharath kumar avusherla > Priority: Major > > When a file name contains a colon and the data contains a newline character, > reading with > spark.read.option("multiLine","true").csv("s3n://DirectoryPath/") throws a > *"java.lang.IllegalArgumentException: java.net.URISyntaxException: Relative > path in absolute URI: 2017-08-01T00:00:00Z.csv.gz"* error. If we remove > option("multiLine","true"), it works fine even though the file name has a > colon in it. It also works fine if I apply *option("multiLine","true")* to > any other file that doesn't have a colon in its name. But when both are > present (a colon in the file name and a newline in the data), it does not > work. > {quote}java.lang.IllegalArgumentException: java.net.URISyntaxException: > Relative path in absolute URI: 2017-08-01T00:00:00Z.csv.gz > at org.apache.hadoop.fs.Path.initialize(Path.java:205) > at org.apache.hadoop.fs.Path.<init>(Path.java:171) > at org.apache.hadoop.fs.Path.<init>(Path.java:93) > at org.apache.hadoop.fs.Globber.glob(Globber.java:253) > at org.apache.hadoop.fs.FileSystem.globStatus(FileSystem.java:1676) > at > org.apache.hadoop.mapreduce.lib.input.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:294) > at > org.apache.hadoop.mapreduce.lib.input.FileInputFormat.listStatus(FileInputFormat.java:265) > at > org.apache.spark.input.StreamFileInputFormat.setMinPartitions(PortableDataStream.scala:51) > at org.apache.spark.rdd.BinaryFileRDD.getPartitions(BinaryFileRDD.scala:46) > at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252) > at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250) > at scala.Option.getOrElse(Option.scala:121) > at 
org.apache.spark.rdd.RDD.partitions(RDD.scala:250) > at > org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35) > at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252) > at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250) > at scala.Option.getOrElse(Option.scala:121) > at org.apache.spark.rdd.RDD.partitions(RDD.scala:250) > at > org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35) > at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252) > at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250) > at scala.Option.getOrElse(Option.scala:121) > at org.apache.spark.rdd.RDD.partitions(RDD.scala:250) > at org.apache.spark.rdd.RDD$$anonfun$take$1.apply(RDD.scala:1333) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) > at org.apache.spark.rdd.RDD.withScope(RDD.scala:362) > at org.apache.spark.rdd.RDD.take(RDD.scala:1327) > at > org.apache.spark.sql.execution.datasources.csv.MultiLineCSVDataSource$.infer(CSVDataSource.scala:224) > at > org.apache.spark.sql.execution.datasources.csv.CSVDataSource.inferSchema(CSVDataSource.scala:62) > at > org.apache.spark.sql.execution.datasources.csv.CSVFileFormat.inferSchema(CSVFileFormat.scala:57) > at > org.apache.spark.sql.execution.datasources.DataSource$$anonfun$7.apply(DataSource.scala:177) > at > org.apache.spark.sql.execution.datasources.DataSource$$anonfun$7.apply(DataSource.scala:177) > at scala.Option.orElse(Option.scala:289) > at > org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:176) > at > org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:366) > at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178) > at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:533) > at 
org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:412) > ... 48 elided > Caused by: java.net.URISyntaxException: Relative path in absolute URI: > 2017-08-01T00:00:00Z.csv.gz > at java.net.URI.checkPath(URI.java:1823) > at java.net.URI.<init>(URI.java:745) > at org.apache.hadoop.fs.Path.initialize(Path.java:202) > ... 86 more > {quote} -- This message was sent by Atlassian JIRA (v7.6.3#76005) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org