[ https://issues.apache.org/jira/browse/SPARK-17143?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Andrew Davidson updated SPARK-17143:
------------------------------------
    Attachment: udfBug.ipynb

The attached notebook demonstrates the reported bug. Note it includes the output from a run on my MacBook Pro. The bug report contains the stack trace produced when the same code is run in my data center.

> pyspark unable to create UDF: java.lang.RuntimeException:
> org.apache.hadoop.fs.FileAlreadyExistsException: Parent path is not a
> directory: /tmp tmp
> ---------------------------------------------------------------------------------------------------------------------------------------------------
>
>                 Key: SPARK-17143
>                 URL: https://issues.apache.org/jira/browse/SPARK-17143
>             Project: Spark
>          Issue Type: Bug
>          Components: PySpark
>    Affects Versions: 1.6.1
>         Environment: spark version: 1.6.1
>                      python version: 3.4.3 (default, Apr 1 2015, 18:10:40)
>                      [GCC 4.8.2 20140120 (Red Hat 4.8.2-16)]
>            Reporter: Andrew Davidson
>         Attachments: udfBug.ipynb
>
>
> For an unknown reason I cannot create a UDF when I run the attached notebook on my cluster. I get the following error:
>
>     Py4JJavaError: An error occurred while calling None.org.apache.spark.sql.hive.HiveContext.
>     : java.lang.RuntimeException: org.apache.hadoop.fs.FileAlreadyExistsException: Parent path is not a directory: /tmp tmp
>
> The notebook runs fine on my Mac. In general I am able to run non-UDF Spark code without any trouble.
>
> I start the notebook server as the user "ec2-user" and use the master URL spark://ec2-51-215-120-63.us-west-1.compute.amazonaws.com:6066
>
> I found the following messages in the notebook server log file. I have the log level set to warn:
>
> 16/08/18 21:38:45 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 1.2.0
> 16/08/18 21:38:45 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException
>
> The cluster was originally created using spark-1.6.1-bin-hadoop2.6/ec2/spark-ec2
>
> #from pyspark.sql import SQLContext, HiveContext
> #sqlContext = SQLContext(sc)
>
> #from pyspark.sql import DataFrame
> #from pyspark.sql import functions
>
> from pyspark.sql.types import StringType
> from pyspark.sql.functions import udf
>
> print("spark version: {}".format(sc.version))
>
> import sys
> print("python version: {}".format(sys.version))
>
> spark version: 1.6.1
> python version: 3.4.3 (default, Apr 1 2015, 18:10:40)
> [GCC 4.8.2 20140120 (Red Hat 4.8.2-16)]
>
> # functions.lower() raises
> # py4j.Py4JException: Method lower([class java.lang.String]) does not exist
> # work around: define a UDF
> toLowerUDFRetType = StringType()
> #toLowerUDF = udf(lambda s : s.lower(), toLowerUDFRetType)
> toLowerUDF = udf(lambda s : s.lower(), StringType())
>
> You must build Spark with Hive.
> Export 'SPARK_HIVE=true' and run build/sbt assembly
>
> Py4JJavaErrorTraceback (most recent call last)
> <ipython-input-2-2e0f7c0bb4f9> in <module>()
>       4 toLowerUDFRetType = StringType()
>       5 #toLowerUDF = udf(lambda s : s.lower(), toLowerUDFRetType)
> ----> 6 toLowerUDF = udf(lambda s : s.lower(), StringType())
>
> /root/spark/python/pyspark/sql/functions.py in udf(f, returnType)
>    1595     [Row(slen=5), Row(slen=3)]
>    1596     """
> -> 1597     return UserDefinedFunction(f, returnType)
>    1598
>    1599 blacklist = ['map', 'since', 'ignore_unicode_prefix']
>
> /root/spark/python/pyspark/sql/functions.py in __init__(self, func, returnType, name)
>    1556         self.returnType = returnType
>    1557         self._broadcast = None
> -> 1558         self._judf = self._create_judf(name)
>    1559
>    1560     def _create_judf(self, name):
>
> /root/spark/python/pyspark/sql/functions.py in _create_judf(self, name)
>    1567         pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command, self)
>    1568         ctx = SQLContext.getOrCreate(sc)
> -> 1569         jdt = ctx._ssql_ctx.parseDataType(self.returnType.json())
>    1570         if name is None:
>    1571             name = f.__name__ if hasattr(f, '__name__') else f.__class__.__name__
>
> /root/spark/python/pyspark/sql/context.py in _ssql_ctx(self)
>     681         try:
>     682             if not hasattr(self, '_scala_HiveContext'):
> --> 683                 self._scala_HiveContext = self._get_hive_ctx()
>     684             return self._scala_HiveContext
>     685         except Py4JError as e:
>
> /root/spark/python/pyspark/sql/context.py in _get_hive_ctx(self)
>     690
>     691     def _get_hive_ctx(self):
> --> 692         return self._jvm.HiveContext(self._jsc.sc())
>     693
>     694     def refreshTable(self, tableName):
>
> /root/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
>    1062         answer = self._gateway_client.send_command(command)
>    1063         return_value = get_return_value(
> -> 1064             answer, self._gateway_client, None, self._fqn)
>    1065
>    1066         for temp_arg in temp_args:
>
> /root/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
>      43     def deco(*a, **kw):
>      44         try:
> ---> 45             return f(*a, **kw)
>      46         except py4j.protocol.Py4JJavaError as e:
>      47             s = e.java_exception.toString()
>
> /root/spark/python/lib/py4j-0.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
>     306                 raise Py4JJavaError(
>     307                     "An error occurred while calling {0}{1}{2}.\n".
> --> 308                     format(target_id, ".", name), value)
>     309             else:
>     310                 raise Py4JError(
>
> Py4JJavaError: An error occurred while calling None.org.apache.spark.sql.hive.HiveContext.
> : java.lang.RuntimeException: org.apache.hadoop.fs.FileAlreadyExistsException: Parent path is not a directory: /tmp tmp
>     at org.apache.hadoop.hdfs.server.namenode.FSDirectory.mkdirs(FSDirectory.java:1489)
>     at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirsInternal(FSNamesystem.java:2979)
>     at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirsInt(FSNamesystem.java:2932)
>     at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirs(FSNamesystem.java:2911)
>     at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.mkdirs(NameNodeRpcServer.java:649)
>     at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.mkdirs(ClientNamenodeProtocolServerSideTranslatorPB.java:417)
>     at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java:44096)
>     at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:453)
>     at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1002)
>     at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1695)
>     at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1691)
>     at java.security.AccessController.doPrivileged(Native Method)
>     at javax.security.auth.Subject.doAs(Subject.java:415)
>     at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1408)
>     at org.apache.hadoop.ipc.Server$Handler.run(Server.java:1689)
>     at org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:522)
>     at org.apache.spark.sql.hive.client.ClientWrapper.<init>(ClientWrapper.scala:204)
>     at org.apache.spark.sql.hive.client.IsolatedClientLoader.createClient(IsolatedClientLoader.scala:238)
>     at org.apache.spark.sql.hive.HiveContext.executionHive$lzycompute(HiveContext.scala:218)
>     at org.apache.spark.sql.hive.HiveContext.executionHive(HiveContext.scala:208)
>     at org.apache.spark.sql.hive.HiveContext.functionRegistry$lzycompute(HiveContext.scala:462)
>     at org.apache.spark.sql.hive.HiveContext.functionRegistry(HiveContext.scala:461)
>     at org.apache.spark.sql.UDFRegistration.<init>(UDFRegistration.scala:40)
>     at org.apache.spark.sql.SQLContext.<init>(SQLContext.scala:330)
>     at org.apache.spark.sql.hive.HiveContext.<init>(HiveContext.scala:90)
>     at org.apache.spark.sql.hive.HiveContext.<init>(HiveContext.scala:101)
>     at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
>     at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
>     at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
>     at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
>     at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:234)
>     at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
>     at py4j.Gateway.invoke(Gateway.java:214)
>     at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:79)
>     at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:68)
>     at py4j.GatewayConnection.run(GatewayConnection.java:209)
>     at java.lang.Thread.run(Thread.java:745)
> Caused by: org.apache.hadoop.fs.FileAlreadyExistsException: Parent path is not a directory: /tmp tmp
>     at org.apache.hadoop.hdfs.server.namenode.FSDirectory.mkdirs(FSDirectory.java:1489)
>     at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirsInternal(FSNamesystem.java:2979)
>     at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirsInt(FSNamesystem.java:2932)
>     at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirs(FSNamesystem.java:2911)
>     at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.mkdirs(NameNodeRpcServer.java:649)
>     at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.mkdirs(ClientNamenodeProtocolServerSideTranslatorPB.java:417)
>     at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java:44096)
>     at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:453)
>     at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1002)
>     at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1695)
>     at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1691)
>     at java.security.AccessController.doPrivileged(Native Method)
>     at javax.security.auth.Subject.doAs(Subject.java:415)
>     at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1408)
>     at org.apache.hadoop.ipc.Server$Handler.run(Server.java:1689)
>     at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
>     at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
>     at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
>     at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
>     at org.apache.hadoop.ipc.RemoteException.instantiateException(RemoteException.java:90)
>     at org.apache.hadoop.ipc.RemoteException.unwrapRemoteException(RemoteException.java:57)
>     at org.apache.hadoop.hdfs.DFSClient.primitiveMkdir(DFSClient.java:2110)
>     at org.apache.hadoop.hdfs.DFSClient.mkdirs(DFSClient.java:2079)
>     at org.apache.hadoop.hdfs.DistributedFileSystem.mkdirs(DistributedFileSystem.java:543)
>     at org.apache.hadoop.hive.ql.exec.Utilities.createDirsWithPermission(Utilities.java:3679)
>     at org.apache.hadoop.hive.ql.session.SessionState.createRootHDFSDir(SessionState.java:597)
>     at org.apache.hadoop.hive.ql.session.SessionState.createSessionDirs(SessionState.java:554)
>     at org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:508)
>     ... 21 more
> Caused by: org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.fs.FileAlreadyExistsException): Parent path is not a directory: /tmp tmp
>     at org.apache.hadoop.hdfs.server.namenode.FSDirectory.mkdirs(FSDirectory.java:1489)
>     at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirsInternal(FSNamesystem.java:2979)
>     at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirsInt(FSNamesystem.java:2932)
>     at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirs(FSNamesystem.java:2911)
>     at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.mkdirs(NameNodeRpcServer.java:649)
>     at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.mkdirs(ClientNamenodeProtocolServerSideTranslatorPB.java:417)
>     at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java:44096)
>     at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:453)
>     at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1002)
>     at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1695)
>     at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1691)
>     at java.security.AccessController.doPrivileged(Native Method)
>     at javax.security.auth.Subject.doAs(Subject.java:415)
>     at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1408)
>     at org.apache.hadoop.ipc.Server$Handler.run(Server.java:1689)
>     at org.apache.hadoop.ipc.Client.call(Client.java:1225)
>     at org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:202)
>     at com.sun.proxy.$Proxy21.mkdirs(Unknown Source)
>     at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
>     at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
>     at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>     at java.lang.reflect.Method.invoke(Method.java:498)
>     at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:164)
>     at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:83)
>     at com.sun.proxy.$Proxy21.mkdirs(Unknown Source)
>     at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.mkdirs(ClientNamenodeProtocolTranslatorPB.java:425)
>     at org.apache.hadoop.hdfs.DFSClient.primitiveMkdir(DFSClient.java:2108)
>     ... 27 more
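For reference, the UDF definition in the notebook is standard PySpark for 1.6; the Python traceback shows the failure comes from udf() forcing construction of a HiveContext (_create_judf -> SQLContext.getOrCreate -> _ssql_ctx -> _get_hive_ctx), not from the lambda itself. Below is a minimal sketch of the intended usage once the context comes up cleanly; the toy DataFrame and the column name "txt" are hypothetical, not from the attached notebook.

    # Minimal usage sketch (assumes a healthy cluster; `sc` is the notebook's
    # SparkContext, and the toy DataFrame below is made up for illustration).
    from pyspark.sql import SQLContext
    from pyspark.sql.types import StringType
    from pyspark.sql.functions import udf

    sqlContext = SQLContext(sc)
    df = sqlContext.createDataFrame([("MiXeD",), ("CaSe",)], ["txt"])

    # Same UDF shape as in the report: wrap str.lower() and declare the
    # return type explicitly.
    toLowerUDF = udf(lambda s: s.lower(), StringType())
    df.select(toLowerUDF(df["txt"]).alias("txtLower")).show()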
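The Java trace narrows the root cause to Hive session startup rather than anything UDF-specific: SessionState.createRootHDFSDir fails to create Hive's scratch directory because HDFS reports the parent /tmp is not a directory, which suggests /tmp on the cluster's default filesystem exists as a plain file. A hedged diagnostic sketch follows; it assumes the hadoop CLI is on the PATH of the machine running the notebook server, and the repair commands in the trailing comment are illustrative and destructive, so verify the listing first.

    # Diagnostic sketch: check whether HDFS /tmp is a file instead of a
    # directory. Assumes the `hadoop` command is available locally.
    import subprocess

    listing = subprocess.check_output(["hadoop", "fs", "-ls", "/"]).decode()
    print(listing)
    # An entry for /tmp whose permission string starts with '-' (not 'd')
    # means mkdir of Hive's scratch directory under /tmp must fail.

    # Possible repair (illustrative): move the stray file aside, then
    # recreate /tmp as a world-writable, sticky directory:
    #   hadoop fs -mv /tmp /tmp.bak
    #   hadoop fs -mkdir /tmp
    #   hadoop fs -chmod 1777 /tmp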