Hi,

Why am I getting this error, which prevents my KMeans clustering job from 
running in Spark? I'm trying to run a sample Scala program from the 
Databricks website on my single-node Cloudera Spark local VM. For 
completeness, the Scala program is as follows. Thanks!

import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

// Load and parse the data
val data = sc.textFile("/path/to/file")
  .map(s => Vectors.dense(s.split(',').map(_.toDouble)))

// Cluster the data into three classes using KMeans
val numIterations = 20
val numClusters = 3
val kmeansModel = KMeans.train(data, numClusters, numIterations)
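
One aside before the log: I believe the first warning below ("input data is 
not directly cached") is separate from the failure and just means the RDD 
should be cached before training, since KMeans makes several passes over the 
data. A minimal sketch of that change, reusing the placeholder path:

val data = sc.textFile("/path/to/file")
  .map(s => Vectors.dense(s.split(',').map(_.toDouble)))
  .cache() // keep the parsed vectors in memory across KMeans iterations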


5/09/23 19:38:11 WARN clustering.KMeans: The input data is not directly cached, which may hurt performance if its parent RDDs are also uncached.
java.io.IOException: No FileSystem for scheme: c
        at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2584)
        at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2591)
        at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:91)
        at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2630)
        at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2612)
        at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:370)
        at org.apache.hadoop.fs.Path.getFileSystem(Path.java:296)
        at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:256)
        at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:228)
        at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:313)
        at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:203)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:219)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:217)
        at scala.Option.getOrElse(Option.scala:120)
        at org.apache.spark.rdd.RDD.partitions(RDD.scala:217)
        at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:32)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:219)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:217)
        at scala.Option.getOrElse(Option.scala:120)
        at org.apache.spark.rdd.RDD.partitions(RDD.scala:217)
        at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:32)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:219)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:217)
        at scala.Option.getOrElse(Option.scala:120)
        at org.apache.spark.rdd.RDD.partitions(RDD.scala:217)
        at org.apache.spark.rdd.ZippedPartitionsBaseRDD.getPartitions(ZippedPartitionsRDD.scala:55)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:219)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:217)
        at scala.Option.getOrElse(Option.scala:120)
        at org.apache.spark.rdd.RDD.partitions(RDD.scala:217)
        at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:32)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:219)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:217)
        at scala.Option.getOrElse(Option.scala:120)
        at org.apache.spark.rdd.RDD.partitions(RDD.scala:217)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:1517)
        at org.apache.spark.rdd.RDD.count(RDD.scala:1006)
        at org.apache.spark.rdd.RDD.takeSample(RDD.scala:428)
        at org.apache.spark.mllib.clustering.KMeans.initKMeansParallel(KMeans.scala:288)
        at org.apache.spark.mllib.clustering.KMeans.runAlgorithm(KMeans.scala:162)
        at org.apache.spark.mllib.clustering.KMeans.run(KMeans.scala:139)
        at org.apache.spark.mllib.clustering.KMeans$.train(KMeans.scala:420)
        at org.apache.spark.mllib.clustering.KMeans$.train(KMeans.scala:430)
        at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:29)
        at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:34)
        at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:36)
        at $iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:38)
        at $iwC$$iwC$$iwC$$iwC.<init>(<console>:40)
        at $iwC$$iwC$$iwC.<init>(<console>:42)
        at $iwC$$iwC.<init>(<console>:44)
        at $iwC.<init>(<console>:46)
        at <init>(<console>:48)
        at .<init>(<console>:52)
        at .<clinit>(<console>)
        at .<init>(<console>:7)
        at .<clinit>(<console>)
        at $print(<console>)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
        at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:606)
        at org.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:1065)
        at org.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1338)
        at org.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:840)
        at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:871)
        at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:819)
        at org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:856)
        at org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:901)
        at org.apache.spark.repl.SparkILoop.command(SparkILoop.scala:813)
        at org.apache.spark.repl.SparkILoop.processLine$1(SparkILoop.scala:656)
        at org.apache.spark.repl.SparkILoop.innerLoop$1(SparkILoop.scala:664)
        at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$loop(SparkILoop.scala:669)
        at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply$mcZ$sp(SparkILoop.scala:996)
        at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:944)
        at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:944)
        at scala.tools.nsc.util.ScalaClassLoader$.savingContextLoader(ScalaClassLoader.scala:135)
        at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$process(SparkILoop.scala:944)
        at org.apache.spark.repl.SparkILoop.process(SparkILoop.scala:1058)
        at org.apache.spark.repl.Main$.main(Main.scala:31)
        at org.apache.spark.repl.Main.main(Main.scala)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
        at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:606)
        at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:569)
        at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:166)
        at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:189)
        at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:110)
        at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
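
Follow-up to myself: the "No FileSystem for scheme: c" line makes me suspect 
the path string rather than KMeans itself. Hadoop treats everything before 
the first ':' in the path as the filesystem scheme, so a Windows-style path 
like c:/data.txt would be parsed as scheme "c" (the /path/to/file above is 
just a placeholder for what I actually typed). If that's right, spelling the 
scheme out explicitly should fix it. A minimal sketch with hypothetical paths:

import org.apache.spark.mllib.linalg.Vectors

// Explicit scheme for a file on the VM's local filesystem
// (hypothetical path -- substitute the real one)
val localData = sc.textFile("file:///home/cloudera/kmeans_data.txt")
  .map(s => Vectors.dense(s.split(',').map(_.toDouble)))

// Explicit scheme for a file in HDFS (hypothetical path)
val hdfsData = sc.textFile("hdfs:///user/cloudera/kmeans_data.txt")
  .map(s => Vectors.dense(s.split(',').map(_.toDouble)))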
