[ https://issues.apache.org/jira/browse/ARROW-17136?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17635315#comment-17635315 ]
Alenka Frim commented on ARROW-17136: ------------------------------------- Thank you for reporting the issue [~asagarshinde]. Yes, there seems to be an inconsistency between the HadoopFileSystem implementation and its [base class FileSystem|https://arrow.apache.org/docs/dev/python/generated/pyarrow.fs.FileSystem.html#pyarrow.fs.FileSystem.open_append_stream], which is where the docstring for {{open_append_stream}} comes from. HadoopFileSystem doesn't create an empty file if the target doesn't exist. This behaviour comes from the external implementation of Hadoop. What could be done on our side is to adopt the same behaviour in the HadoopFileSystem implementation as in the base FileSystem class by changing the C++ {{open_append_stream}} check so that it would, in the case of a non-existent target, create a new empty file. I don't think this is a priority for now, but contributions are very welcome. > open_append_stream throwing an error if file does not exists > ------------------------------------------------------------ > > Key: ARROW-17136 > URL: https://issues.apache.org/jira/browse/ARROW-17136 > Project: Apache Arrow > Issue Type: Bug > Components: Python > Affects Versions: 8.0.0 > Reporter: Sagar Shinde > Priority: Minor > > as per the document method, open_append_stream will create the file if does > not exists. But when I try to append to the file in hdfs it is throwing an > error like file, not found. 
> hdfsOpenFile(/tmp/xyz.json): > FileSystem#append((Lorg/apache/hadoop/fs/Path;)Lorg/apache/hadoop/fs/FSDataOutputStream;) > error: > RemoteException: Failed to append to non-existent file /tmp/xyz.json for > client > at > org.apache.hadoop.hdfs.server.namenode.FSDirAppendOp.appendFile(FSDirAppendOp.java:104) > at > org.apache.hadoop.hdfs.server.namenode.FSNamesystem.appendFile(FSNamesystem.java:2639) > at > org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.append(NameNodeRpcServer.java:805) > at > org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.append(ClientNamenodeProtocolServerSideTranslatorPB.java:487) > at > org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java) > at > org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:524) > at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1025) > at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:876) > at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:822) > at java.security.AccessController.doPrivileged(Native Method) > at javax.security.auth.Subject.doAs(Subject.java:422) > at > org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1730) > at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2682) > java.io.FileNotFoundException: Failed to append to non-existent file > /tmp/xyz.json for client x.x.x.x > at > org.apache.hadoop.hdfs.server.namenode.FSDirAppendOp.appendFile(FSDirAppendOp.java:104) > at > org.apache.hadoop.hdfs.server.namenode.FSNamesystem.appendFile(FSNamesystem.java:2639) > at > org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.append(NameNodeRpcServer.java:805) > at > org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.append(ClientNamenodeProtocolServerSideTranslatorPB.java:487) > at > 
org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java) > at > org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:524) > at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1025) > at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:876) > at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:822) > at java.security.AccessController.doPrivileged(Native Method) > at javax.security.auth.Subject.doAs(Subject.java:422) > at > org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1730) > at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2682) > at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native > Method) > at > sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62) > at > sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45) > at java.lang.reflect.Constructor.newInstance(Constructor.java:423) > at > org.apache.hadoop.ipc.RemoteException.instantiateException(RemoteException.java:121) > at > org.apache.hadoop.ipc.RemoteException.unwrapRemoteException(RemoteException.java:88) > at org.apache.hadoop.hdfs.DFSClient.callAppend(DFSClient.java:1367) > at org.apache.hadoop.hdfs.DFSClient.append(DFSClient.java:1424) > at org.apache.hadoop.hdfs.DFSClient.append(DFSClient.java:1394) > at > org.apache.hadoop.hdfs.DistributedFileSystem$5.doCall(DistributedFileSystem.java:423) > at > org.apache.hadoop.hdfs.DistributedFileSystem$5.doCall(DistributedFileSystem.java:419) > at > org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81) > at > org.apache.hadoop.hdfs.DistributedFileSystem.append(DistributedFileSystem.java:431) > at > org.apache.hadoop.hdfs.DistributedFileSystem.append(DistributedFileSystem.java:400) > at org.apache.hadoop.fs.FileSystem.append(FileSystem.java:1386) > Caused by: > 
org.apache.hadoop.ipc.RemoteException(java.io.FileNotFoundException): Failed > to append to non-existent file /tmp/xyz.json for client 10.128.8.11 > at > org.apache.hadoop.hdfs.server.namenode.FSDirAppendOp.appendFile(FSDirAppendOp.java:104) > at > org.apache.hadoop.hdfs.server.namenode.FSNamesystem.appendFile(FSNamesystem.java:2639) > at > org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.append(NameNodeRpcServer.java:805) > at > org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.append(ClientNamenodeProtocolServerSideTranslatorPB.java:487) > at > org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java) > at > org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:524) > at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1025) > at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:876) > at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:822) > at java.security.AccessController.doPrivileged(Native Method) > at javax.security.auth.Subject.doAs(Subject.java:422) > at > org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1730) > at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2682) > at org.apache.hadoop.ipc.Client.getRpcResponse(Client.java:1562) > at org.apache.hadoop.ipc.Client.call(Client.java:1508) > at org.apache.hadoop.ipc.Client.call(Client.java:1405) > at > org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:233) > at > org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:118) > at com.sun.proxy.$Proxy9.append(Unknown Source) > at > org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.append(ClientNamenodeProtocolTranslatorPB.java:403) > at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) > at > sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) > at > 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:498) > at > org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:422) > at > org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeMethod(RetryInvocationHandler.java:165) > at > org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invoke(RetryInvocationHandler.java:157) > at > org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeOnce(RetryInvocationHandler.java:95) > at > org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:359) > at com.sun.proxy.$Proxy10.append(Unknown Source) > at org.apache.hadoop.hdfs.DFSClient.callAppend(DFSClient.java:1333) > at org.apache.hadoop.hdfs.DFSClient.callAppend(DFSClient.java:1355) > ... 8 more > Traceback (most recent call last): > File "<stdin>", line 1, in <module> > File "pyarrow/_fs.pyx", line 738, in > pyarrow._fs.FileSystem.open_append_stream > File "pyarrow/error.pxi", line 144, in > pyarrow.lib.pyarrow_internal_check_status > File "pyarrow/error.pxi", line 113, in pyarrow.lib.check_status > FileNotFoundError: [Errno 2] Opening HDFS file '/tmp/xyz.json' failed. > Detail: [errno 2] No such file or directory -- This message was sent by Atlassian Jira (v8.20.10#820010)