[ https://issues.apache.org/jira/browse/SPARK-4130?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Joseph E. Gonzalez updated SPARK-4130: -------------------------------------- Description: When testing MLlib on the splice site data (http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#splice-site) the call to MLUtils.loadLibSVMFile fails with a NumberFormatException. To reproduce in spark shell: ``` import org.apache.spark.mllib.util.MLUtils val data = MLUtils.loadLibSVMFile(sc, "hdfs://ec2-54-200-69-227.us-west-2.compute.amazonaws.com:9000/splice_site.t") ``` generates the error: ``` org.apache.spark.SparkException: Job aborted due to stage failure: Task 0.0:73 failed 4 times, most recent failure: Exception failure in TID 335 on host ip-172-31-31-54.us-west-2.compute.internal: java.lang.NumberFormatException: For input string: "" java.lang.NumberFormatException.forInputString(NumberFormatException.java:65) java.lang.Integer.parseInt(Integer.java:504) java.lang.Integer.parseInt(Integer.java:527) scala.collection.immutable.StringLike$class.toInt(StringLike.scala:229) scala.collection.immutable.StringOps.toInt(StringOps.scala:31) org.apache.spark.mllib.util.MLUtils$$anonfun$4$$anonfun$5.apply(MLUtils.scala:81) org.apache.spark.mllib.util.MLUtils$$anonfun$4$$anonfun$5.apply(MLUtils.scala:79) scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244) scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244) scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33) scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:108) scala.collection.TraversableLike$class.map(TraversableLike.scala:244) scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:108) org.apache.spark.mllib.util.MLUtils$$anonfun$4.apply(MLUtils.scala:79) org.apache.spark.mllib.util.MLUtils$$anonfun$4.apply(MLUtils.scala:76) scala.collection.Iterator$$anon$11.next(Iterator.scala:328) scala.collection.Iterator$class.foreach(Iterator.scala:727) scala.collection.AbstractIterator.foreach(Iterator.scala:1157) 
scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48) scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103) org.apache.spark.CacheManager.getOrCompute(CacheManager.scala:107) org.apache.spark.rdd.RDD.iterator(RDD.scala:227) org.apache.spark.rdd.MappedRDD.compute(MappedRDD.scala:31) org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262) org.apache.spark.rdd.RDD.iterator(RDD.scala:229) org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:111) org.apache.spark.scheduler.Task.run(Task.scala:51) org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:187) java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) java.lang.Thread.run(Thread.java:745) ``` was: When testing MLlib on the splice site data (http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#splice-site) the loadSVM. To reproduce in spark shell: import org.apache.spark.mllib.util.MLUtils val data = MLUtils.loadLibSVMFile(sc, "hdfs://ec2-54-200-69-227.us-west-2.compute.amazonaws.com:9000/splice_site.t") generates the error: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0.0:73 failed 4 times, most recent failure: Exception failure in TID 335 on host ip-172-31-31-54.us-west-2.compute.internal: java.lang.NumberFormatException: For input string: "" java.lang.NumberFormatException.forInputString(NumberFormatException.java:65) java.lang.Integer.parseInt(Integer.java:504) java.lang.Integer.parseInt(Integer.java:527) scala.collection.immutable.StringLike$class.toInt(StringLike.scala:229) scala.collection.immutable.StringOps.toInt(StringOps.scala:31) org.apache.spark.mllib.util.MLUtils$$anonfun$4$$anonfun$5.apply(MLUtils.scala:81) org.apache.spark.mllib.util.MLUtils$$anonfun$4$$anonfun$5.apply(MLUtils.scala:79) scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244) 
scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244) scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33) scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:108) scala.collection.TraversableLike$class.map(TraversableLike.scala:244) scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:108) org.apache.spark.mllib.util.MLUtils$$anonfun$4.apply(MLUtils.scala:79) org.apache.spark.mllib.util.MLUtils$$anonfun$4.apply(MLUtils.scala:76) scala.collection.Iterator$$anon$11.next(Iterator.scala:328) scala.collection.Iterator$class.foreach(Iterator.scala:727) scala.collection.AbstractIterator.foreach(Iterator.scala:1157) scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48) scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103) org.apache.spark.CacheManager.getOrCompute(CacheManager.scala:107) org.apache.spark.rdd.RDD.iterator(RDD.scala:227) org.apache.spark.rdd.MappedRDD.compute(MappedRDD.scala:31) org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262) org.apache.spark.rdd.RDD.iterator(RDD.scala:229) org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:111) org.apache.spark.scheduler.Task.run(Task.scala:51) org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:187) java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) java.lang.Thread.run(Thread.java:745) > loadLibSVMFile does not handle extra whitespace > ----------------------------------------------- > > Key: SPARK-4130 > URL: https://issues.apache.org/jira/browse/SPARK-4130 > Project: Spark > Issue Type: Bug > Components: MLlib > Reporter: Joseph E. Gonzalez > > When testing MLlib on the splice site data > (http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#splice-site) > the call to MLUtils.loadLibSVMFile fails with a NumberFormatException. 
To reproduce in spark shell: > ``` > import org.apache.spark.mllib.util.MLUtils > val data = MLUtils.loadLibSVMFile(sc, > "hdfs://ec2-54-200-69-227.us-west-2.compute.amazonaws.com:9000/splice_site.t") > ``` > generates the error: > ``` > org.apache.spark.SparkException: Job aborted due to stage failure: Task > 0.0:73 failed 4 times, most recent failure: Exception failure in TID 335 on > host ip-172-31-31-54.us-west-2.compute.internal: > java.lang.NumberFormatException: For input string: "" > > java.lang.NumberFormatException.forInputString(NumberFormatException.java:65) > java.lang.Integer.parseInt(Integer.java:504) > java.lang.Integer.parseInt(Integer.java:527) > > scala.collection.immutable.StringLike$class.toInt(StringLike.scala:229) > scala.collection.immutable.StringOps.toInt(StringOps.scala:31) > > org.apache.spark.mllib.util.MLUtils$$anonfun$4$$anonfun$5.apply(MLUtils.scala:81) > > org.apache.spark.mllib.util.MLUtils$$anonfun$4$$anonfun$5.apply(MLUtils.scala:79) > > scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244) > > scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244) > > scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33) > scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:108) > scala.collection.TraversableLike$class.map(TraversableLike.scala:244) > scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:108) > org.apache.spark.mllib.util.MLUtils$$anonfun$4.apply(MLUtils.scala:79) > org.apache.spark.mllib.util.MLUtils$$anonfun$4.apply(MLUtils.scala:76) > scala.collection.Iterator$$anon$11.next(Iterator.scala:328) > scala.collection.Iterator$class.foreach(Iterator.scala:727) > scala.collection.AbstractIterator.foreach(Iterator.scala:1157) > > scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48) > > scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103) > 
org.apache.spark.CacheManager.getOrCompute(CacheManager.scala:107) > org.apache.spark.rdd.RDD.iterator(RDD.scala:227) > org.apache.spark.rdd.MappedRDD.compute(MappedRDD.scala:31) > org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262) > org.apache.spark.rdd.RDD.iterator(RDD.scala:229) > org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:111) > org.apache.spark.scheduler.Task.run(Task.scala:51) > org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:187) > > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) > > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) > java.lang.Thread.run(Thread.java:745) > ``` -- This message was sent by Atlassian JIRA (v6.3.4#6332) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org