Hi, I am using Python 3, Java 8, and Spark 1.6.1, and I am running my code in a
Jupyter notebook.

The following code runs fine on my Mac:

import re

from pyspark.sql import functions
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

# emojiPattern2 (a regex pattern) and emojiSpecialDF (a DataFrame with a
# string column "body") are defined in earlier notebook cells
udfRetType = ArrayType(StringType(), True)
findEmojiUDF = udf(lambda s: re.findall(emojiPattern2, s), udfRetType)

retDF = (emojiSpecialDF
            # convert each body into a list of emojis
            .select("body",
                    findEmojiUDF(emojiSpecialDF.body).alias("listEmojis"))
            # explode: turn the list of emojis into one row per emoji
            .select("*", functions.explode("listEmojis").alias("emoji"))
        )

retDF.printSchema()
retDF.show(40, truncate=False)
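For reference, printSchema() should report something like the following. This is
reconstructed from the types in the code rather than pasted from my session, and
body's nullability depends on how emojiSpecialDF was built:

root
 |-- body: string (nullable = true)
 |-- listEmojis: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- emoji: string (nullable = true)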

When I run it on my cluster, I get:

Py4JJavaError: An error occurred while calling None.org.apache.spark.sql.hive.HiveContext.
: java.lang.RuntimeException: org.apache.hadoop.fs.FileAlreadyExistsException: Parent path is not a directory: /tmp tmp
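From the traceback below, the failure is not in my UDF at all: udf() goes through
SQLContext.getOrCreate and _ssql_ctx, which constructs the JVM HiveContext. A
minimal sketch that I would expect to reproduce it (the appName is a placeholder
I made up; the master URL is the one I use below):

from pyspark import SparkContext
from pyspark.sql import HiveContext

sc = SparkContext(master="spark://ec2-54-215-230-73.us-west-1.compute.amazonaws.com:6066",
                  appName="hiveContextRepro")  # appName is a placeholder

# pyspark creates the JVM HiveContext lazily, so the exception should
# surface on first use rather than at construction time
sqlContext = HiveContext(sc)
sqlContext.sql("SELECT 1").show()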

I checked the file permissions. I start my notebook server as user ec2-user:


[ec2-user exploration]$ ls -ld /tmp

drwxrwxrwt 5 root root 4096 Aug 18 18:14 /tmp
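
Since the stack trace shows the mkdirs call going through DFSClient and
DistributedFileSystem, i.e. HDFS rather than the local filesystem, maybe it is
the HDFS /tmp that needs checking. Assuming the hadoop CLI is on the path,
something like:

[ec2-user exploration]$ hadoop fs -ls /

should show whether /tmp on HDFS is a directory or a plain file.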





On the cluster I use the master URL
spark://ec2-54-215-230-73.us-west-1.compute.amazonaws.com:6066 (all my other
Spark code seems to work fine).


Below is the complete stack trace.

Any idea what the problem is?

Thanks

Andy

You must build Spark with Hive. Export 'SPARK_HIVE=true' and run build/sbt assembly

Py4JJavaErrorTraceback (most recent call last)
<ipython-input-15-f3e0066400d3> in <module>()
      1 udfRetType = ArrayType(StringType(), True)
----> 2 findEmojiUDF = udf(lambda s : re.findall(emojiPattern2, s), udfRetType)
      3 
      4 retDF = (emojiSpecialDF
      5             # convert into a list of emojis

/root/spark/python/pyspark/sql/functions.py in udf(f, returnType)
   1595     [Row(slen=5), Row(slen=3)]
   1596     """
-> 1597     return UserDefinedFunction(f, returnType)
   1598 
   1599 blacklist = ['map', 'since', 'ignore_unicode_prefix']

/root/spark/python/pyspark/sql/functions.py in __init__(self, func, returnType, name)
   1556         self.returnType = returnType
   1557         self._broadcast = None
-> 1558         self._judf = self._create_judf(name)
   1559 
   1560     def _create_judf(self, name):

/root/spark/python/pyspark/sql/functions.py in _create_judf(self, name)
   1567         pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command, self)
   1568         ctx = SQLContext.getOrCreate(sc)
-> 1569         jdt = ctx._ssql_ctx.parseDataType(self.returnType.json())
   1570         if name is None:
   1571             name = f.__name__ if hasattr(f, '__name__') else f.__class__.__name__

/root/spark/python/pyspark/sql/context.py in _ssql_ctx(self)
    681         try:
    682             if not hasattr(self, '_scala_HiveContext'):
--> 683                 self._scala_HiveContext = self._get_hive_ctx()
    684             return self._scala_HiveContext
    685         except Py4JError as e:

/root/spark/python/pyspark/sql/context.py in _get_hive_ctx(self)
    690 
    691     def _get_hive_ctx(self):
--> 692         return self._jvm.HiveContext(self._jsc.sc())
    693 
    694     def refreshTable(self, tableName):

/root/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
   1062         answer = self._gateway_client.send_command(command)
   1063         return_value = get_return_value(
-> 1064             answer, self._gateway_client, None, self._fqn)
   1065 
   1066         for temp_arg in temp_args:

/root/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
     43     def deco(*a, **kw):
     44         try:
---> 45             return f(*a, **kw)
     46         except py4j.protocol.Py4JJavaError as e:
     47             s = e.java_exception.toString()

/root/spark/python/lib/py4j-0.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
    306                 raise Py4JJavaError(
    307                     "An error occurred while calling {0}{1}{2}.\n".
--> 308                     format(target_id, ".", name), value)
    309             else:
    310                 raise Py4JError(

Py4JJavaError: An error occurred while calling None.org.apache.spark.sql.hive.HiveContext.
: java.lang.RuntimeException: org.apache.hadoop.fs.FileAlreadyExistsException: Parent path is not a directory: /tmp tmp
        at org.apache.hadoop.hdfs.server.namenode.FSDirectory.mkdirs(FSDirectory.java:1489)
        at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirsInternal(FSNamesystem.java:2979)
        at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirsInt(FSNamesystem.java:2932)
        at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirs(FSNamesystem.java:2911)
        at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.mkdirs(NameNodeRpcServer.java:649)
        at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.mkdirs(ClientNamenodeProtocolServerSideTranslatorPB.java:417)
        at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java:44096)
        at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:453)
        at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1002)
        at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1695)
        at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1691)
        at java.security.AccessController.doPrivileged(Native Method)
        at javax.security.auth.Subject.doAs(Subject.java:415)
        at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1408)
        at org.apache.hadoop.ipc.Server$Handler.run(Server.java:1689)

        at org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:522)
        at org.apache.spark.sql.hive.client.ClientWrapper.<init>(ClientWrapper.scala:204)
        at org.apache.spark.sql.hive.client.IsolatedClientLoader.createClient(IsolatedClientLoader.scala:238)
        at org.apache.spark.sql.hive.HiveContext.executionHive$lzycompute(HiveContext.scala:218)
        at org.apache.spark.sql.hive.HiveContext.executionHive(HiveContext.scala:208)
        at org.apache.spark.sql.hive.HiveContext.functionRegistry$lzycompute(HiveContext.scala:462)
        at org.apache.spark.sql.hive.HiveContext.functionRegistry(HiveContext.scala:461)
        at org.apache.spark.sql.UDFRegistration.<init>(UDFRegistration.scala:40)
        at org.apache.spark.sql.SQLContext.<init>(SQLContext.scala:330)
        at org.apache.spark.sql.hive.HiveContext.<init>(HiveContext.scala:90)
        at org.apache.spark.sql.hive.HiveContext.<init>(HiveContext.scala:101)
        at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
        at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
        at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
        at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
        at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:234)
        at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
        at py4j.Gateway.invoke(Gateway.java:214)
        at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:79)
        at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:68)
        at py4j.GatewayConnection.run(GatewayConnection.java:209)
        at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.hadoop.fs.FileAlreadyExistsException: Parent path is not a directory: /tmp tmp
        at org.apache.hadoop.hdfs.server.namenode.FSDirectory.mkdirs(FSDirectory.java:1489)
        at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirsInternal(FSNamesystem.java:2979)
        at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirsInt(FSNamesystem.java:2932)
        at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirs(FSNamesystem.java:2911)
        at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.mkdirs(NameNodeRpcServer.java:649)
        at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.mkdirs(ClientNamenodeProtocolServerSideTranslatorPB.java:417)
        at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java:44096)
        at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:453)
        at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1002)
        at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1695)
        at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1691)
        at java.security.AccessController.doPrivileged(Native Method)
        at javax.security.auth.Subject.doAs(Subject.java:415)
        at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1408)
        at org.apache.hadoop.ipc.Server$Handler.run(Server.java:1689)

        at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
        at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
        at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
        at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
        at org.apache.hadoop.ipc.RemoteException.instantiateException(RemoteException.java:90)
        at org.apache.hadoop.ipc.RemoteException.unwrapRemoteException(RemoteException.java:57)
        at org.apache.hadoop.hdfs.DFSClient.primitiveMkdir(DFSClient.java:2110)
        at org.apache.hadoop.hdfs.DFSClient.mkdirs(DFSClient.java:2079)
        at org.apache.hadoop.hdfs.DistributedFileSystem.mkdirs(DistributedFileSystem.java:543)
        at org.apache.hadoop.hive.ql.exec.Utilities.createDirsWithPermission(Utilities.java:3679)
        at org.apache.hadoop.hive.ql.session.SessionState.createRootHDFSDir(SessionState.java:597)
        at org.apache.hadoop.hive.ql.session.SessionState.createSessionDirs(SessionState.java:554)
        at org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:508)
        ... 21 more
Caused by: org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.fs.FileAlreadyExistsException): Parent path is not a directory: /tmp tmp
        at org.apache.hadoop.hdfs.server.namenode.FSDirectory.mkdirs(FSDirectory.java:1489)
        at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirsInternal(FSNamesystem.java:2979)
        at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirsInt(FSNamesystem.java:2932)
        at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirs(FSNamesystem.java:2911)
        at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.mkdirs(NameNodeRpcServer.java:649)
        at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.mkdirs(ClientNamenodeProtocolServerSideTranslatorPB.java:417)
        at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java:44096)
        at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:453)
        at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1002)
        at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1695)
        at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1691)
        at java.security.AccessController.doPrivileged(Native Method)
        at javax.security.auth.Subject.doAs(Subject.java:415)
        at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1408)
        at org.apache.hadoop.ipc.Server$Handler.run(Server.java:1689)

        at org.apache.hadoop.ipc.Client.call(Client.java:1225)
        at org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:202)
        at com.sun.proxy.$Proxy21.mkdirs(Unknown Source)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
        at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:498)
        at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:164)
        at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:83)
        at com.sun.proxy.$Proxy21.mkdirs(Unknown Source)
        at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.mkdirs(ClientNamenodeProtocolTranslatorPB.java:425)
        at org.apache.hadoop.hdfs.DFSClient.primitiveMkdir(DFSClient.java:2108)
        ... 27 more

