Hi Jeff,

Sorry I did not respond sooner; I was out of town.
Here is the code I use to initialize the HiveContext:

# load data set
from pyspark.sql import HiveContext #,SQLContext, Row

# window functions require HiveContext (spark 2.x will not require hive)
#sqlContext = SQLContext(sc)
hiveSqlContext = HiveContext(sc)

Here is the complete stack trace. Could the problem be due to the size of numDimensions?

numDimensions = 713912692155621377

The indices are sorted, so I am not sure why this exception is raised (see also the small sanity-check snippet at the very end of this mail, after the quoted thread):

p/pyspark/mllib/linalg/__init__.py", line 531, in __init__
    raise TypeError("indices array must be sorted")
TypeError: indices array must be sorted

import numpy as np
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg import VectorUDT

#sv1 = Vectors.sparse(3, [0, 2], [1.0, 3.0])
# 3 = size
# [0, 2] = int indices
# [1.0, 3.0] = values

"""
root
 |-- id: string (nullable = true)
 |-- follows: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: long (nullable = false)
 |    |    |-- screenName: string (nullable = false)
"""

def toSparseVector(pojoList):
    indices = []
    for pojo in pojoList:
        indices.append(pojo.id)

    sortedIndices = sorted(indices)
    logical = np.ones(len(sortedIndices))
    vec = Vectors.sparse(numDimensions, sortedIndices, logical)
    return vec

#myUDF = udf(lambda pojoList: labelStr if (labelStr == "noise") else "injury", StringType())
newColName = "features"
myUDF = udf(toSparseVector, VectorUDT())
featuresDF = df.withColumn(newColName, myUDF(df["follows"]))

In [16]: featuresDF.printSchema()
         featuresDF.show()

root
 |-- id: string (nullable = true)
 |-- follows: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: long (nullable = false)
 |    |    |-- screenName: string (nullable = false)
 |-- features: vector (nullable = true)

---------------------------------------------------------------------------
Py4JJavaError                             Traceback (most recent call last)
<ipython-input-16-6f7c439ddd93> in <module>()
      1 featuresDF.printSchema()
----> 2 featuresDF.show()

/Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/dataframe.py in show(self, n, truncate)
    255         +---+-----+
    256         """
--> 257         print(self._jdf.showString(n, truncate))
    258
    259     def __repr__(self):

/Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
    811         answer = self.gateway_client.send_command(command)
    812         return_value = get_return_value(
--> 813             answer, self.gateway_client, self.target_id, self.name)
    814
    815         for temp_arg in temp_args:

/Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/utils.py in deco(*a, **kw)
     43     def deco(*a, **kw):
     44         try:
---> 45             return f(*a, **kw)
     46         except py4j.protocol.Py4JJavaError as e:
     47             s = e.java_exception.toString()

/Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/lib/py4j-0.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
    306                 raise Py4JJavaError(
    307                     "An error occurred while calling {0}{1}{2}.\n".
--> 308                     format(target_id, ".", name), value)
    309             else:
    310                 raise Py4JError(

Py4JJavaError: An error occurred while calling o128.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 16.0 failed 1 times, most recent failure: Lost task 0.0 in stage 16.0 (TID 219, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
    process()
  File "/Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/Users/andrewdavidson/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/serializers.py", line 263, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/functions.py", line 1563, in <lambda>
    func = lambda _, it: map(lambda x: returnType.toInternal(f(*x)), it)
  File "<ipython-input-15-9076fa544242>", line 28, in toSparseVector
  File "/Users/andrewdavidson/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/mllib/linalg/__init__.py", line 827, in sparse
    return SparseVector(size, *args)
  File "/Users/andrewdavidson/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/mllib/linalg/__init__.py", line 531, in __init__
    raise TypeError("indices array must be sorted")
TypeError: indices array must be sorted

        at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
        at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
        at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
        at org.apache.spark.sql.execution.BatchPythonEvaluation$$anonfun$doExecute$1.apply(python.scala:405)
        at org.apache.spark.sql.execution.BatchPythonEvaluation$$anonfun$doExecute$1.apply(python.scala:370)
        at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$20.apply(RDD.scala:710)
        at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$20.apply(RDD.scala:710)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
        at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
        at org.apache.spark.scheduler.Task.run(Task.scala:89)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
        at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
        at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
        at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419)
        at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418)
        at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
        at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
        at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418)
        at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
        at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
        at scala.Option.foreach(Option.scala:236)
        at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799)
        at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1640)
        at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
        at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
        at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
        at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:1832)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:1845)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:1858)
        at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:212)
        at org.apache.spark.sql.execution.Limit.executeCollect(basicOperators.scala:165)
        at org.apache.spark.sql.execution.SparkPlan.executeCollectPublic(SparkPlan.scala:174)
        at org.apache.spark.sql.DataFrame$$anonfun$org$apache$spark$sql$DataFrame$$execute$1$1.apply(DataFrame.scala:1499)
        at org.apache.spark.sql.DataFrame$$anonfun$org$apache$spark$sql$DataFrame$$execute$1$1.apply(DataFrame.scala:1499)
        at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:56)
        at org.apache.spark.sql.DataFrame.withNewExecutionId(DataFrame.scala:2086)
        at org.apache.spark.sql.DataFrame.org$apache$spark$sql$DataFrame$$execute$1(DataFrame.scala:1498)
        at org.apache.spark.sql.DataFrame.org$apache$spark$sql$DataFrame$$collect(DataFrame.scala:1505)
        at org.apache.spark.sql.DataFrame$$anonfun$head$1.apply(DataFrame.scala:1375)
        at org.apache.spark.sql.DataFrame$$anonfun$head$1.apply(DataFrame.scala:1374)
        at org.apache.spark.sql.DataFrame.withCallback(DataFrame.scala:2099)
        at org.apache.spark.sql.DataFrame.head(DataFrame.scala:1374)
        at org.apache.spark.sql.DataFrame.take(DataFrame.scala:1456)
        at org.apache.spark.sql.DataFrame.showString(DataFrame.scala:170)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
        at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:497)
        at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
        at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
        at py4j.Gateway.invoke(Gateway.java:259)
        at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
        at py4j.commands.CallCommand.execute(CallCommand.java:79)
        at py4j.GatewayConnection.run(GatewayConnection.java:209)
        at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
    process()
  File "/Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/Users/andrewdavidson/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/serializers.py", line 263, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/Users/andrewdavidson/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/functions.py", line 1563, in <lambda>
    func = lambda _, it: map(lambda x: returnType.toInternal(f(*x)), it)
  File "<ipython-input-15-9076fa544242>", line 28, in toSparseVector
  File "/Users/andrewdavidson/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/mllib/linalg/__init__.py", line 827, in sparse
    return SparseVector(size, *args)
  File "/Users/andrewdavidson/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/mllib/linalg/__init__.py", line 531, in __init__
    raise TypeError("indices array must be sorted")
TypeError: indices array must be sorted
        at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
        at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
        at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
        at org.apache.spark.sql.execution.BatchPythonEvaluation$$anonfun$doExecute$1.apply(python.scala:405)
        at org.apache.spark.sql.execution.BatchPythonEvaluation$$anonfun$doExecute$1.apply(python.scala:370)
        at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$20.apply(RDD.scala:710)
        at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$20.apply(RDD.scala:710)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
        at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
        at org.apache.spark.scheduler.Task.run(Task.scala:89)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
        ... 1 more

From: Jeff Zhang <zjf...@gmail.com>
Date: Tuesday, March 29, 2016 at 10:34 PM
To: Andrew Davidson <a...@santacruzintegration.com>
Cc: "user @spark" <user@spark.apache.org>
Subject: Re: pyspark unable to convert dataframe column to a vector: Unable to instantiate org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient

> According the stack trace, it seems the HiveContext is not initialized
> correctly. Do you have any more error message ?
>
> On Tue, Mar 29, 2016 at 9:29 AM, Andy Davidson <a...@santacruzintegration.com>
> wrote:
>> I am using pyspark spark-1.6.1-bin-hadoop2.6 and python3. I have a data frame
>> with a column I need to convert to a sparse vector. I get an exception
>>
>> Any idea what my bug is?
>>
>> Kind regards
>>
>> Andy
>>
>>
>> Py4JJavaError: An error occurred while calling
>> None.org.apache.spark.sql.hive.HiveContext.
>> : java.lang.RuntimeException: java.lang.RuntimeException: Unable to
>> instantiate org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient
>> at org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:522)
>> at org.apache.spark.sql.hive.client.ClientWrapper.<init>(ClientWrapper.scala:204)
>>
>> Here is my python code fragment with a more complete stack trace
>>
>> # load data set
>> from pyspark.sql import HiveContext #,SQLContext, Row
>>
>> # window functions require HiveContext (spark 2.x will not require hive)
>> #sqlContext = SQLContext(sc)
>> hiveSqlContext = HiveContext(sc)
>>
>> …
>>
>> import numpy as np
>> from pyspark.mllib.linalg import Vectors
>> from pyspark.mllib.linalg import VectorUDT
>>
>> #sv1 = Vectors.sparse(3, [0, 2], [1.0, 3.0])
>> # = 3 = size
>> # [0,1] int indices
>> # [1.0, 3.0] values
>>
>>
>> """
>> root
>>  |-- id: string (nullable = true)
>>  |-- samples: array (nullable = true)
>>  |    |-- element: struct (containsNull = true)
>>  |    |    |-- id: long (nullable = false)
>>  |    |    |-- rateStr: string (nullable = false)
>>
>> """
>>
>> def toSparseVector(pojoList) :
>>     indicies = []
>>     for pojo in pojoList :
>>         indicies.append(pojo.id)
>>
>>     l = np.ones(len(indicies))
>>     v = Vectors.spark(numDimensions, indicies, l)
>>     return v
>>
>> myUDF = udf(toSparseVector, VectorUDT()))
>> features = df.withColumn(newColName, myUDF(df["samples"]))
>>
>>
>> Py4JJavaError                             Traceback (most recent call last)
>> <ipython-input-77-30ab820130a0> in <module>()
>>      30 #myUDF = udf(lambda pojoList: labelStr if (labelStr == "noise") else "injury", StringType())
>>      31
>> ---> 32 myUDF = udf(toSparseVector, VectorUDT()) #
>>      33 features = df.withColumn(newColName, myUDF(df["follows"]))
>>
>> /Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/functions.py in udf(f, returnType)
>>    1595     [Row(slen=5), Row(slen=3)]
>>    1596     """
>> -> 1597     return UserDefinedFunction(f, returnType)
>>    1598
>>    1599 blacklist = ['map', 'since', 'ignore_unicode_prefix']
>>
>> /Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/functions.py in __init__(self, func, returnType, name)
>>    1556         self.returnType = returnType
>>    1557         self._broadcast = None
>> -> 1558         self._judf = self._create_judf(name)
>>    1559
>>    1560     def _create_judf(self, name):
>>
>> /Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/functions.py in _create_judf(self, name)
>>    1567         pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command, self)
>>    1568         ctx = SQLContext.getOrCreate(sc)
>> -> 1569         jdt = ctx._ssql_ctx.parseDataType(self.returnType.json())
>>    1570         if name is None:
>>    1571             name = f.__name__ if hasattr(f, '__name__') else f.__class__.__name__
>>
>> /Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/context.py in _ssql_ctx(self)
>>     681         try:
>>     682             if not hasattr(self, '_scala_HiveContext'):
>> --> 683                 self._scala_HiveContext = self._get_hive_ctx()
>>     684             return self._scala_HiveContext
>>     685         except Py4JError as e:
>>
>> /Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/context.py in _get_hive_ctx(self)
>>     690
>>     691     def _get_hive_ctx(self):
>> --> 692         return self._jvm.HiveContext(self._jsc.sc())
>>     693
>>     694     def refreshTable(self, tableName):
>>
>> /Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
>>    1062         answer = self._gateway_client.send_command(command)
>>    1063         return_value = get_return_value(
>> -> 1064             answer, self._gateway_client, None, self._fqn)
>>    1065
>>    1066         for temp_arg in temp_args:
>>
>> /Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/utils.py in deco(*a, **kw)
>>     43     def deco(*a, **kw):
>>     44         try:
>> ---> 45             return f(*a, **kw)
>>     46         except py4j.protocol.Py4JJavaError as e:
>>     47             s = e.java_exception.toString()
>>
>> /Users/andrewdavidson/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/lib/py4j-0.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
>>     306                 raise Py4JJavaError(
>>     307                     "An error occurred while calling {0}{1}{2}.\n".
>> --> 308                     format(target_id, ".", name), value)
>>     309             else:
>>     310                 raise Py4JError(
>>
>> Py4JJavaError: An error occurred while calling
>> None.org.apache.spark.sql.hive.HiveContext.
>> : java.lang.RuntimeException: java.lang.RuntimeException: Unable to
>> instantiate org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient
>> at org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:522)
>> at org.apache.spark.sql.hive.client.ClientWrapper.<init>(ClientWrapper.scala:204)
>> at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
>> at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
>> at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
>> at java.lang.reflect.Constructor.newInstance(Constructor.java:422)
>> at org.apache.spark.sql.hive.client.IsolatedClientLoader.createClient(IsolatedClientLoader.scala:249)
>> at org.apache.spark.sql.hive.HiveContext.metadataHive$lzycompute(HiveContext.scala:327)
>> at org.apache.spark.sql.hive.HiveContext.metadataHive(HiveContext.scala:237)
>> at org.apache.spark.sql.hive.HiveContext.setConf(HiveContext.scala:441)
>> at org.apache.spark.sql.hive.HiveContext.defaultOverrides(HiveContext.scala:226)
>> at org.apache.spark.sql.hive.HiveContext.<init>(HiveContext.scala:229)
>> at org.apache.spark.sql.hive.HiveContext.<init>(HiveContext.scala:101)
>>
>
>
>
> --
> Best Regards
>
> Jeff Zhang
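
P.S. While writing this up I went back to my own question about the size of numDimensions. My reading of pyspark/mllib/linalg/__init__.py (and of the Scala SparseVector, whose indices are Ints) is that the indices end up as 32-bit ints, so a raw follower id larger than 2**31 - 1 would not survive the cast, which could make an index list that I sorted in Python look unsorted (or duplicated) by the time it is checked. I have not confirmed that this is what is happening; the snippet below is just the sanity check I plan to run on the driver (plain Python, no Spark needed), and the sample ids in it are made up.

INT32_MAX = 2**31 - 1   # 2147483647, the largest value a 32-bit signed int can hold

numDimensions = 713912692155621377                      # value from my notebook
sampleIds = [713912692155621376, 2147483648, 42, 42]    # hypothetical follower ids

# Does the vector size itself fit in a 32-bit int?
print("numDimensions fits in int32:", numDimensions <= INT32_MAX)

# Which ids would overflow if cast to int32?
print("ids too large for int32:", [i for i in sampleIds if i > INT32_MAX])

# Duplicate ids are worth ruling out too, in case the check requires strictly increasing indices.
print("duplicate ids present:", len(sampleIds) != len(set(sampleIds)))

If the 32-bit limit does turn out to be the problem, I assume I would need to remap the raw ids into a dense 0..N-1 index space before building the vectors. Does that match your reading of the code?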