[jira] [Commented] (SPARK-19521) Error with embedded line break (multi-line record) in csv file.
[ https://issues.apache.org/jira/browse/SPARK-19521?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15858632#comment-15858632 ]

Ruslan Korniichuk commented on SPARK-19521:
-------------------------------------------

    df = sql_context.read.csv(
        path=src,
        sep=',',
        encoding="UTF-8",
        quote='"',
        escape='"',
        header=True,
        inferSchema=True,
        ignoreLeadingWhiteSpace=True,
        ignoreTrailingWhiteSpace=False,
        mode=parser_mode)

> Error with embedded line break (multi-line record) in csv file.
> ----------------------------------------------------------------
>
>                 Key: SPARK-19521
>                 URL: https://issues.apache.org/jira/browse/SPARK-19521
>             Project: Spark
>          Issue Type: Bug
>          Components: PySpark
>    Affects Versions: 2.1.0
>         Environment: $ uname -a
>                      Linux jupyter-test 3.19.0-25-generic #26~14.04.1-Ubuntu SMP Fri Jul 24 21:16:20 UTC 2015 x86_64 x86_64 x86_64 GNU/Linux
>            Reporter: Ruslan Korniichuk
>
> Input csv file:
> id,name,amount,isActive,Remark
> 1,Barney & Company,0,,"Great to work with
> and always pays with cash."
>
> Output json file with spark-2.0.2-bin-hadoop2.7:
> {"id":"1","name":"Barney & Company","amount":0,"Remark":"Great to work with \nand always pays with cash."}
>
> Error with spark-2.1.0-bin-hadoop2.7:
> java.lang.RuntimeException: Malformed line in FAILFAST mode: and always pays with cash."
>
> 17/02/08 22:53:02 ERROR Utils: Aborting task
> java.lang.RuntimeException: Malformed line in FAILFAST mode: and always pays with cash."
>     at org.apache.spark.sql.execution.datasources.csv.CSVRelation$$anonfun$csvParser$3.apply(CSVRelation.scala:106)
>     at org.apache.spark.sql.execution.datasources.csv.CSVRelation$$anonfun$csvParser$3.apply(CSVRelation.scala:94)
>     at org.apache.spark.sql.execution.datasources.csv.CSVFileFormat$$anonfun$buildReader$1$$anonfun$apply$2.apply(CSVFileFormat.scala:167)
>     at org.apache.spark.sql.execution.datasources.csv.CSVFileFormat$$anonfun$buildReader$1$$anonfun$apply$2.apply(CSVFileFormat.scala:166)
>     at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
>     at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
>     at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
>     at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:102)
>     at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
>     at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
>     at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:377)
>     at org.apache.spark.sql.execution.datasources.FileFormatWriter$SingleDirectoryWriteTask.execute(FileFormatWriter.scala:243)
>     at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$3.apply(FileFormatWriter.scala:190)
>     at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$3.apply(FileFormatWriter.scala:188)
>     at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1341)
>     at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:193)
>     at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1$$anonfun$3.apply(FileFormatWriter.scala:129)
>     at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1$$anonfun$3.apply(FileFormatWriter.scala:128)
>     at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
>     at org.apache.spark.scheduler.Task.run(Task.scala:99)
>     at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
>     at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
>     at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
>     at java.lang.Thread.run(Thread.java:745)
>
> 17/02/08 22:53:02 ERROR FileFormatWriter: Job job_20170208225302_0003 aborted.
> 17/02/08 22:53:02 ERROR Executor: Exception in task 0.0 in stage 3.0 (TID 4)
> org.apache.spark.SparkException: Task failed while writing rows
>     at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:204)
>     at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1$$anonfun$3.apply(FileFormatWriter.scala:129)
>     at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1$$anonf
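For context: in spark-2.1.0 the CSV reader splits its input into records by newline before parsing, so a quoted field that contains a line break is handed to the parser as two separate records, and the second half fails the column-count check (hence the FAILFAST error above). The sketch below is not part of the original report; it reproduces the record from the description and uses the multiLine option, which was only added to the CSV reader in later Spark releases (2.2+), so it is a workaround for newer versions rather than a fix for 2.1.0. The file path is an arbitrary assumption.

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("multiline-csv-demo").getOrCreate()

    # Write the exact record from the issue description: one row whose
    # quoted Remark field contains an embedded line break.
    csv_text = (
        'id,name,amount,isActive,Remark\n'
        '1,Barney & Company,0,,"Great to work with\n'
        'and always pays with cash."\n'
    )
    with open("/tmp/multiline.csv", "w") as f:
        f.write(csv_text)

    # In 2.1.0 this fails under FAILFAST because the reader treats the second
    # physical line as its own one-column record; with multiLine=True on
    # Spark 2.2+ the quoted field is read back as a single value.
    df = spark.read.csv(
        "/tmp/multiline.csv",
        header=True,
        quote='"',
        escape='"',
        multiLine=True,
        mode="FAILFAST")
    df.show(truncate=False)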
[jira] [Commented] (SPARK-19521) Error with embedded line break (multi-line record) in csv file.
[ https://issues.apache.org/jira/browse/SPARK-19521?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15858631#comment-15858631 ]

Ruslan Korniichuk commented on SPARK-19521:
-------------------------------------------

    def create_json(src, dst, logger=None):
        """Create json file.

        Input:
            src -- src csv file abs path,
            dst -- dst json file abs path,
            logger -- logger obj (default: None).
        """

        # Create Spark SQL context
        sql_context = SQLContext(settings.sc)

        # Create data frame
        parser_mode = settings.cfg.get("spark", "spark_parser_mode")
        df = sql_context.read.csv(
            path=src,
            sep=',',
            encoding="UTF-8",
            quote='"',
            escape='"',
            header=True,
            inferSchema=True,
            ignoreLeadingWhiteSpace=True,
            ignoreTrailingWhiteSpace=False,
            mode=parser_mode)

        # Save data frame as json file
        save_mode = settings.cfg.get("spark", "spark_save_mode")
        try:
            df.write.json(dst, mode=save_mode)
        except BaseException as exception:
            if logger is not None:
                msg = "_error_DataFrameError"
                logger.error(settings.messages[msg] % (src, exception))
            return 1
        else:
            if logger is not None:
                msg = "_savedJSON"
                logger.info(settings.messages[msg] % (src, dst))
            return 0
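The function above takes the parser mode from configuration rather than hard-coding it. The CSV reader's mode option accepts PERMISSIVE (the default, which null-pads malformed rows), DROPMALFORMED (which discards them), and FAILFAST (which raises on the first malformed record, as in the traceback above). The sketch below is not from the report; it shows how each mode behaves on the multi-line file written in the earlier sketch. Before 2.2, only FAILFAST surfaces the problem as an error; the other two modes silently mangle or drop the broken row.

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("csv-mode-demo").getOrCreate()

    for parser_mode in ("PERMISSIVE", "DROPMALFORMED", "FAILFAST"):
        try:
            df = spark.read.csv(
                "/tmp/multiline.csv",  # file from the earlier sketch
                header=True,
                quote='"',
                escape='"',
                mode=parser_mode)
            # Reading is lazy, so the action below is what actually parses.
            print(parser_mode, "->", df.count(), "row(s)")
        except Exception as exc:
            print(parser_mode, "-> failed:", exc)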
[jira] [Commented] (SPARK-19521) Error with embedded line break (multi-line record) in csv file.
[ https://issues.apache.org/jira/browse/SPARK-19521?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15858628#comment-15858628 ]

Ruslan Korniichuk commented on SPARK-19521:
-------------------------------------------

    def create_json(src, dst, logger=None):
        """Create json file.

        Input:
            src -- src csv file abs path,
            dst -- dst json file abs path,
            logger -- logger obj (default: None).
        """

        # Create Spark SQL context
        sql_context = SQLContext(settings.sc)

        # Create data frame
        parser_mode = settings.cfg.get("spark", "spark_parser_mode")
        df = sql_context.read.csv(
            path=src,
            sep=',',
            encoding="UTF-8",
            quote='"',
            escape='"',
            header=True,
            inferSchema=True,
            ignoreLeadingWhiteSpace=True,
            ignoreTrailingWhiteSpace=False,
            mode=parser_mode)

        # Save data frame as json file
        save_mode = settings.cfg.get("spark", "spark_save_mode")
        try:
            df.write.json(dst, mode=save_mode)
        except BaseException as exception:
            if logger is not None:
                msg = "_error_DataFrameError"
                logger.error(settings.messages[msg] % (src, exception))
            return 1
        else:
            if logger is not None:
                msg = "_savedJSON"
                logger.info(settings.messages[msg] % (src, dst))
            return 0
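create_json relies on a module-level settings object that the comments do not show. The sketch below is a guess at its minimal shape, just enough to make the function runnable under Python 3; every name and value in it (app name, config values, message strings, file paths) is an illustrative assumption, not something from the report.

    from configparser import ConfigParser
    from pyspark import SparkContext
    from pyspark.sql import SQLContext

    class settings:
        # Assumed stand-in for the reporter's settings module.
        sc = SparkContext(appName="csv2json")
        cfg = ConfigParser()
        cfg.read_dict({"spark": {
            "spark_parser_mode": "FAILFAST",  # the mode that hits SPARK-19521
            "spark_save_mode": "overwrite",
        }})
        messages = {
            "_error_DataFrameError": "csv -> json failed for %s: %s",
            "_savedJSON": "csv %s saved as json %s",
        }

    # With the multi-line record above, this returns 1 (and logs the parser
    # error) on spark-2.1.0-bin-hadoop2.7, but returns 0 and writes the json
    # shown in the description on spark-2.0.2-bin-hadoop2.7.
    rc = create_json("/tmp/multiline.csv", "/tmp/multiline.json")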