Hi, I am using Python 3, Java 8 and Spark 1.6.1. I am running my code in a Jupyter notebook.
The following code runs fine on my Mac:

udfRetType = ArrayType(StringType(), True)
findEmojiUDF = udf(lambda s : re.findall(emojiPattern2, s), udfRetType)

retDF = (emojiSpecialDF
    # convert into a list of emojis
    .select("body", findEmojiUDF(emojiSpecialDF.body).alias("listEmojis"))
    # explode , convert list of emojis into separate rows
    .select("*", functions.explode("listEmojis").alias("emoji"))
)
retDF.printSchema()
retDF.show(40, truncate=False)

When I run it on my cluster, I get:

Py4JJavaError: An error occurred while calling None.org.apache.spark.sql.hive.HiveContext. : java.lang.RuntimeException: org.apache.hadoop.fs.FileAlreadyExistsException: Parent path is not a directory: /tmp tmp

I checked the file permissions. I start my notebook server as user ec2-user:

[ec2-user exploration]$ ls -ld /tmp
drwxrwxrwt 5 root root 4096 Aug 18 18:14 /tmp

In the cluster I use masterURL spark://ec2-54-215-230-73.us-west-1.compute.amazonaws.com:6066 (all my other Spark code seems to work fine).

Below is the complete stack trace.

Any idea what the problem is?

Thanks,
Andy

You must build Spark with Hive. 
Export 'SPARK_HIVE=true' and run build/sbt assembly Py4JJavaErrorTraceback (most recent call last) <ipython-input-15-f3e0066400d3> in <module>() 1 udfRetType = ArrayType(StringType(), True) ----> 2 findEmojiUDF = udf(lambda s : re.findall(emojiPattern2, s), udfRetType) 3 4 retDF = (emojiSpecialDF 5 # convert into a list of emojis /root/spark/python/pyspark/sql/functions.py in udf(f, returnType) 1595 [Row(slen=5), Row(slen=3)] 1596 """ -> 1597 return UserDefinedFunction(f, returnType) 1598 1599 blacklist = ['map', 'since', 'ignore_unicode_prefix'] /root/spark/python/pyspark/sql/functions.py in __init__(self, func, returnType, name) 1556 self.returnType = returnType 1557 self._broadcast = None -> 1558 self._judf = self._create_judf(name) 1559 1560 def _create_judf(self, name): /root/spark/python/pyspark/sql/functions.py in _create_judf(self, name) 1567 pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command, self) 1568 ctx = SQLContext.getOrCreate(sc) -> 1569 jdt = ctx._ssql_ctx.parseDataType(self.returnType.json()) 1570 if name is None: 1571 name = f.__name__ if hasattr(f, '__name__') else f.__class__.__name__ /root/spark/python/pyspark/sql/context.py in _ssql_ctx(self) 681 try: 682 if not hasattr(self, '_scala_HiveContext'): --> 683 self._scala_HiveContext = self._get_hive_ctx() 684 return self._scala_HiveContext 685 except Py4JError as e: /root/spark/python/pyspark/sql/context.py in _get_hive_ctx(self) 690 691 def _get_hive_ctx(self): --> 692 return self._jvm.HiveContext(self._jsc.sc()) 693 694 def refreshTable(self, tableName): /root/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in __call__(self, *args) 1062 answer = self._gateway_client.send_command(command) 1063 return_value = get_return_value( -> 1064 answer, self._gateway_client, None, self._fqn) 1065 1066 for temp_arg in temp_args: /root/spark/python/pyspark/sql/utils.py in deco(*a, **kw) 43 def deco(*a, **kw): 44 try: ---> 45 return f(*a, **kw) 46 except 
py4j.protocol.Py4JJavaError as e: 47 s = e.java_exception.toString() /root/spark/python/lib/py4j-0.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name) 306 raise Py4JJavaError( 307 "An error occurred while calling {0}{1}{2}.\n". --> 308 format(target_id, ".", name), value) 309 else: 310 raise Py4JError( Py4JJavaError: An error occurred while calling None.org.apache.spark.sql.hive.HiveContext. : java.lang.RuntimeException: org.apache.hadoop.fs.FileAlreadyExistsException: Parent path is not a directory: /tmp tmp at org.apache.hadoop.hdfs.server.namenode.FSDirectory.mkdirs(FSDirectory.java:1 489) at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirsInternal(FSNamesys tem.java:2979) at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirsInt(FSNamesystem.j ava:2932) at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirs(FSNamesystem.java :2911) at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.mkdirs(NameNodeRpcS erver.java:649) at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslator PB.mkdirs(ClientNamenodeProtocolServerSideTranslatorPB.java:417) at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNam enodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java:44096) at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(Proto bufRpcEngine.java:453) at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1002) at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1695) at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1691) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:415) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.ja va:1408) at org.apache.hadoop.ipc.Server$Handler.run(Server.java:1689) at org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:522) at 
org.apache.spark.sql.hive.client.ClientWrapper.<init>(ClientWrapper.scala:20 4) at org.apache.spark.sql.hive.client.IsolatedClientLoader.createClient(IsolatedC lientLoader.scala:238) at org.apache.spark.sql.hive.HiveContext.executionHive$lzycompute(HiveContext.s cala:218) at org.apache.spark.sql.hive.HiveContext.executionHive(HiveContext.scala:208) at org.apache.spark.sql.hive.HiveContext.functionRegistry$lzycompute(HiveContex t.scala:462) at org.apache.spark.sql.hive.HiveContext.functionRegistry(HiveContext.scala:461 ) at org.apache.spark.sql.UDFRegistration.<init>(UDFRegistration.scala:40) at org.apache.spark.sql.SQLContext.<init>(SQLContext.scala:330) at org.apache.spark.sql.hive.HiveContext.<init>(HiveContext.scala:90) at org.apache.spark.sql.hive.HiveContext.<init>(HiveContext.scala:101) at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method) at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAcces sorImpl.java:62) at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstruc torAccessorImpl.java:45) at java.lang.reflect.Constructor.newInstance(Constructor.java:423) at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:234) at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381) at py4j.Gateway.invoke(Gateway.java:214) at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:7 9) at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:68) at py4j.GatewayConnection.run(GatewayConnection.java:209) at java.lang.Thread.run(Thread.java:745) Caused by: org.apache.hadoop.fs.FileAlreadyExistsException: Parent path is not a directory: /tmp tmp at org.apache.hadoop.hdfs.server.namenode.FSDirectory.mkdirs(FSDirectory.java:1 489) at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirsInternal(FSNamesys tem.java:2979) at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirsInt(FSNamesystem.j ava:2932) at 
org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirs(FSNamesystem.java :2911) at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.mkdirs(NameNodeRpcS erver.java:649) at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslator PB.mkdirs(ClientNamenodeProtocolServerSideTranslatorPB.java:417) at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNam enodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java:44096) at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(Proto bufRpcEngine.java:453) at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1002) at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1695) at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1691) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:415) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.ja va:1408) at org.apache.hadoop.ipc.Server$Handler.run(Server.java:1689) at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method) at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAcces sorImpl.java:62) at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstruc torAccessorImpl.java:45) at java.lang.reflect.Constructor.newInstance(Constructor.java:423) at org.apache.hadoop.ipc.RemoteException.instantiateException(RemoteException.j ava:90) at org.apache.hadoop.ipc.RemoteException.unwrapRemoteException(RemoteException. java:57) at org.apache.hadoop.hdfs.DFSClient.primitiveMkdir(DFSClient.java:2110) at org.apache.hadoop.hdfs.DFSClient.mkdirs(DFSClient.java:2079) at org.apache.hadoop.hdfs.DistributedFileSystem.mkdirs(DistributedFileSystem.ja va:543) at org.apache.hadoop.hive.ql.exec.Utilities.createDirsWithPermission(Utilities. 
java:3679) at org.apache.hadoop.hive.ql.session.SessionState.createRootHDFSDir(SessionStat e.java:597) at org.apache.hadoop.hive.ql.session.SessionState.createSessionDirs(SessionStat e.java:554) at org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:508) ... 21 more Caused by: org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.fs.FileAlreadyExists Exception): Parent path is not a directory: /tmp tmp at org.apache.hadoop.hdfs.server.namenode.FSDirectory.mkdirs(FSDirectory.java:1 489) at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirsInternal(FSNamesys tem.java:2979) at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirsInt(FSNamesystem.j ava:2932) at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirs(FSNamesystem.java :2911) at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.mkdirs(NameNodeRpcS erver.java:649) at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslator PB.mkdirs(ClientNamenodeProtocolServerSideTranslatorPB.java:417) at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNam enodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java:44096) at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(Proto bufRpcEngine.java:453) at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1002) at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1695) at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1691) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:415) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.ja va:1408) at org.apache.hadoop.ipc.Server$Handler.run(Server.java:1689) at org.apache.hadoop.ipc.Client.call(Client.java:1225) at org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.jav a:202) at com.sun.proxy.$Proxy21.mkdirs(Unknown Source) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at 
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62 ) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl .java:43) at java.lang.reflect.Method.invoke(Method.java:498) at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocati onHandler.java:164) at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHand ler.java:83) at com.sun.proxy.$Proxy21.mkdirs(Unknown Source) at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.mkdirs( ClientNamenodeProtocolTranslatorPB.java:425) at org.apache.hadoop.hdfs.DFSClient.primitiveMkdir(DFSClient.java:2108) ... 27 more In [ ]: