According to the stack trace, it seems the HiveContext is not initialized correctly. Do you have any more error messages?
On Tue, Mar 29, 2016 at 9:29 AM, Andy Davidson < a...@santacruzintegration.com> wrote: > I am using pyspark spark-1.6.1-bin-hadoop2.6 and python3. I have a data > frame with a column I need to convert to a sparse vector. I get an > exception > > Any idea what my bug is? > > Kind regards > > Andy > > > Py4JJavaError: An error occurred while calling > None.org.apache.spark.sql.hive.HiveContext. > : java.lang.RuntimeException: java.lang.RuntimeException: Unable to > instantiate org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient > at > org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:522) > at > org.apache.spark.sql.hive.client.ClientWrapper.<init>(ClientWrapper.scala:204) > > > *Here is my python code fragment with a more complete stack trace* > > # load data set > from pyspark.sql import HiveContext #,SQLContext, Row > > # window functions require HiveContext (spark 2.x will not require hive) > #sqlContext = SQLContext(sc) > hiveSqlContext = HiveContext(sc) > > … > > import numpy as np > from pyspark.mllib.linalg import Vectors > from pyspark.mllib.linalg import VectorUDT > > #sv1 = Vectors.sparse(3, [0, 2], [1.0, 3.0]) > # = 3 = size > # [0,1] int indices > #[1.0, 3.0] values > > > """ > root > |-- id: string (nullable = true) > |-- samples: array (nullable = true) > | |-- element: struct (containsNull = true) > | | |-- id: long (nullable = false) > | | |-- rateStr: string (nullable = false) > > """ > > def toSparseVector(pojoList) : > indicies = [] > for pojo in pojoList : > indicies.append(pojo.id) > > l = np.ones(len(indicies)) > v = Vectors.spark(numDimensions, indicies, l) > return v > > myUDF = udf(toSparseVector, VectorUDT())) > features = df.withColumn(newColName, myUDF(df[“samples"])) > > > Py4JJavaError Traceback (most recent call > last)<ipython-input-77-30ab820130a0> in <module>() 30 #myUDF = udf(lambda > pojoList: labelStr if (labelStr == "noise") else "injury", StringType()) > 31 ---> 32 myUDF = udf(toSparseVector, 
VectorUDT()) # 33 features = > df.withColumn(newColName, myUDF(df["follows"])) > /Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/functions.py > in udf(f, returnType) 1595 [Row(slen=5), Row(slen=3)] 1596 """-> > 1597 return UserDefinedFunction(f, returnType) 1598 1599 blacklist = > ['map', 'since', 'ignore_unicode_prefix'] > /Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/functions.py > in __init__(self, func, returnType, name) 1556 self.returnType = > returnType 1557 self._broadcast = None-> 1558 self._judf = > self._create_judf(name) 1559 1560 def _create_judf(self, name): > /Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/functions.py > in _create_judf(self, name) 1567 pickled_command, broadcast_vars, > env, includes = _prepare_for_python_RDD(sc, command, self) 1568 ctx > = SQLContext.getOrCreate(sc)-> 1569 jdt = > ctx._ssql_ctx.parseDataType(self.returnType.json()) 1570 if name is > None: 1571 name = f.__name__ if hasattr(f, '__name__') else > f.__class__.__name__ > /Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/context.py > in _ssql_ctx(self) 681 try: 682 if not > hasattr(self, '_scala_HiveContext'):--> 683 > self._scala_HiveContext = self._get_hive_ctx() 684 return > self._scala_HiveContext 685 except Py4JError as e: > /Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/context.py > in _get_hive_ctx(self) 690 691 def _get_hive_ctx(self):--> 692 > return self._jvm.HiveContext(self._jsc.sc()) 693 694 def > refreshTable(self, tableName): > /Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py > in __call__(self, *args) 1062 answer = > self._gateway_client.send_command(command) 1063 return_value = > get_return_value(-> 1064 answer, self._gateway_client, None, > self._fqn) 1065 1066 for temp_arg in temp_args: > /Users/f/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/utils.py > in deco(*a, **kw) 43 def 
deco(*a, **kw): 44 try:---> 45 > return f(*a, **kw) 46 except > py4j.protocol.Py4JJavaError as e: 47 s = > e.java_exception.toString() > /Users/andrewdavidson/workSpace/spark/spark-1.6.1-bin-hadoop2.6/python/lib/py4j-0.9-src.zip/py4j/protocol.py > in get_return_value(answer, gateway_client, target_id, name) 306 > raise Py4JJavaError( 307 "An error occurred > while calling {0}{1}{2}.\n".--> 308 format(target_id, > ".", name), value) 309 else: 310 raise > Py4JError( > Py4JJavaError: An error occurred while calling > None.org.apache.spark.sql.hive.HiveContext. > : java.lang.RuntimeException: java.lang.RuntimeException: Unable to > instantiate org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient > at > org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:522) > at > org.apache.spark.sql.hive.client.ClientWrapper.<init>(ClientWrapper.scala:204) > at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method) > at > sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62) > at > sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45) > at java.lang.reflect.Constructor.newInstance(Constructor.java:422) > at > org.apache.spark.sql.hive.client.IsolatedClientLoader.createClient(IsolatedClientLoader.scala:249) > at > org.apache.spark.sql.hive.HiveContext.metadataHive$lzycompute(HiveContext.scala:327) > at > org.apache.spark.sql.hive.HiveContext.metadataHive(HiveContext.scala:237) > at org.apache.spark.sql.hive.HiveContext.setConf(HiveContext.scala:441) > at > org.apache.spark.sql.hive.HiveContext.defaultOverrides(HiveContext.scala:226) > at org.apache.spark.sql.hive.HiveContext.<init>(HiveContext.scala:229) > at org.apache.spark.sql.hive.HiveContext.<init>(HiveContext.scala:101) > > > -- Best Regards Jeff Zhang