[ https://issues.apache.org/jira/browse/SPARK-13846?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Dmitry Spikhalskiy updated SPARK-13846:
---------------------------------------
    Description: 
I got the following exception, which appears to be caused by an unknown categorical variable
value being passed to the indexer.

java.util.NoSuchElementException: key not found: 20.0
        at scala.collection.MapLike$class.default(MapLike.scala:228)
        at scala.collection.AbstractMap.default(Map.scala:58)
        at scala.collection.MapLike$class.apply(MapLike.scala:141)
        at scala.collection.AbstractMap.apply(Map.scala:58)
        at org.apache.spark.ml.feature.VectorIndexerModel$$anonfun$10$$anonfun$apply$4.apply(VectorIndexer.scala:316)
        at org.apache.spark.ml.feature.VectorIndexerModel$$anonfun$10$$anonfun$apply$4.apply(VectorIndexer.scala:315)
        at scala.collection.immutable.HashMap$HashMap1.foreach(HashMap.scala:224)
        at scala.collection.immutable.HashMap$HashTrieMap.foreach(HashMap.scala:403)
        at org.apache.spark.ml.feature.VectorIndexerModel$$anonfun$10.apply(VectorIndexer.scala:315)
        at org.apache.spark.ml.feature.VectorIndexerModel$$anonfun$10.apply(VectorIndexer.scala:309)
        at org.apache.spark.ml.feature.VectorIndexerModel$$anonfun$11.apply(VectorIndexer.scala:351)
        at org.apache.spark.ml.feature.VectorIndexerModel$$anonfun$11.apply(VectorIndexer.scala:351)
        at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.evalExpr2$(Unknown Source)
        at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
        at org.apache.spark.sql.execution.Project$$anonfun$1$$anonfun$apply$1.apply(basicOperators.scala:51)
        at org.apache.spark.sql.execution.Project$$anonfun$1$$anonfun$apply$1.apply(basicOperators.scala:49)
        at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
        at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
        at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
        at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:149)
        at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
        at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
        at org.apache.spark.scheduler.Task.run(Task.scala:89)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
        at java.lang.Thread.run(Thread.java:745)

The VectorIndexer was created like this:
    val featureIndexer = new VectorIndexer()
        .setInputCol(DataFrameColumns.FEATURES)
        .setOutputCol("indexedFeatures")
        .setMaxCategories(5)
        .fit(trainingDF)
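
For reference, here is a minimal, self-contained sketch of how the exception can be triggered (the column name, toy data, and the sqlContext handle below are illustrative, not from my actual pipeline): fit the indexer on training data where a feature has few enough distinct values to be treated as categorical, then transform data containing a value the indexer never saw during fitting.

    import org.apache.spark.ml.feature.VectorIndexer
    import org.apache.spark.mllib.linalg.Vectors  // spark.ml operates on mllib vectors in 1.6.x

    // Training data: the single feature only takes the values 1.0 and 2.0, so with
    // maxCategories = 5 it is treated as categorical with categories {1.0, 2.0}.
    val trainingDF = sqlContext.createDataFrame(Seq(
      Tuple1(Vectors.dense(1.0)),
      Tuple1(Vectors.dense(2.0))
    )).toDF("features")

    val indexer = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexedFeatures")
      .setMaxCategories(5)
      .fit(trainingDF)

    // Data containing a value (20.0) that was never seen during fitting.
    val testDF = sqlContext.createDataFrame(Seq(
      Tuple1(Vectors.dense(20.0))
    )).toDF("features")

    // Evaluating the transformed DataFrame throws
    // java.util.NoSuchElementException: key not found: 20.0
    indexer.transform(testDF).collect()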

The output should not be just the default java.util.NoSuchElementException, but something
more specific, such as an UnknownCategoricalValue exception carrying information that would
help locate the source element of the vector (for example, the element's index in the vector).
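
As a sketch only (the names and signatures below are illustrative, not the actual VectorIndexer internals), the per-feature category lookup could be guarded so that a missing key is reported together with the feature index and the offending value:

    // Hypothetical exception type along the lines suggested above.
    class UnknownCategoricalValueException(featureIndex: Int, value: Double)
      extends RuntimeException(
        s"Unknown categorical value $value for vector element at index $featureIndex")

    // Sketch of a guarded lookup: categoryMap maps raw feature values to category
    // indices for the feature at position featureIndex in the input vector.
    def indexCategory(categoryMap: Map[Double, Int], featureIndex: Int, value: Double): Int =
      categoryMap.getOrElse(value,
        throw new UnknownCategoricalValueException(featureIndex, value))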

> VectorIndexer output on unknown feature should be more descriptive
> ------------------------------------------------------------------
>
>                 Key: SPARK-13846
>                 URL: https://issues.apache.org/jira/browse/SPARK-13846
>             Project: Spark
>          Issue Type: Bug
>          Components: ML
>    Affects Versions: 1.6.1
>            Reporter: Dmitry Spikhalskiy
>            Priority: Minor
>


