[ https://issues.apache.org/jira/browse/HADOOP-18252?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Aaron Whiteway resolved HADOOP-18252.
-------------------------------------
    Resolution: Invalid

> Hadoop 3.3.3 Spark write Mode.Overwrite breaks partitioned tables
> -----------------------------------------------------------------
>
>                 Key: HADOOP-18252
>                 URL: https://issues.apache.org/jira/browse/HADOOP-18252
>             Project: Hadoop Common
>          Issue Type: Bug
>            Reporter: Aaron Whiteway
>            Priority: Major
>
> While testing Hadoop 3.3.3 with S3A against a versioning-enabled bucket, I ran into an issue where Spark/Hadoop tries to load partitions that no longer exist.
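> Roughly the write/read sequence that triggers this (a minimal sketch, not the exact job: {{test_loc}} appears in the traceback below, the partition column {{Part}} and the ~81 MiB file sizes come from the versioned listing at the end of this report, and the DataFrame contents are made up for illustration):
> {code:python}
> from pyspark.sql import SparkSession
>
> spark = SparkSession.builder.getOrCreate()
>
> # Path as it appears in the traceback below.
> test_loc = "s3a://test/s32/singleday_parts_simple2"
>
> # Hypothetical rows; the real part files are ~81 MiB each.
> df1 = spark.createDataFrame([("TESTING_1", 1), ("TESTING_2", 2)], ["Part", "value"])
> df1.write.mode("overwrite").partitionBy("Part").parquet(test_loc)
>
> # Overwrite with data that no longer contains Part=TESTING_1; on a
> # versioned bucket the old objects are replaced by delete markers.
> df2 = spark.createDataFrame([("TESTING_2", 3)], ["Part", "value"])
> df2.write.mode("overwrite").partitionBy("Part").parquet(test_loc)
>
> # The read then fails: Spark still finds the old partition directory while
> # listing leaf files, but getFileStatus on it raises FileNotFoundException.
> test_load = spark.read.parquet(test_loc)
> {code}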
> : java.io.FileNotFoundException: No such file or directory: > s3a://test/s32/singleday_parts_simple2/Part=TESTING_1 > at > org.apache.hadoop.fs.s3a.S3AFileSystem.s3GetFileStatus(S3AFileSystem.java:2269) > at > org.apache.hadoop.fs.s3a.S3AFileSystem.innerGetFileStatus(S3AFileSystem.java:2163) > at > org.apache.hadoop.fs.s3a.S3AFileSystem.getFileStatus(S3AFileSystem.java:2102) > at > org.apache.hadoop.fs.s3a.S3AFileSystem.innerListStatus(S3AFileSystem.java:1903) > at > org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$listStatus$9(S3AFileSystem.java:1882) > at org.apache.hadoop.fs.s3a.Invoker.once(Invoker.java:109) > at > org.apache.hadoop.fs.s3a.S3AFileSystem.listStatus(S3AFileSystem.java:1882) > at > org.apache.spark.util.HadoopFSUtils$.listLeafFiles(HadoopFSUtils.scala:225) > at > org.apache.spark.util.HadoopFSUtils$.$anonfun$listLeafFiles$7(HadoopFSUtils.scala:281) > at > scala.collection.TraversableLike.$anonfun$flatMap$1(TraversableLike.scala:245) > at > scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36) > at > scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33) > at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198) > at scala.collection.TraversableLike.flatMap(TraversableLike.scala:245) > at scala.collection.TraversableLike.flatMap$(TraversableLike.scala:242) > at scala.collection.mutable.ArrayOps$ofRef.flatMap(ArrayOps.scala:198) > at > org.apache.spark.util.HadoopFSUtils$.listLeafFiles(HadoopFSUtils.scala:271) > at > org.apache.spark.util.HadoopFSUtils$.$anonfun$parallelListLeafFilesInternal$1(HadoopFSUtils.scala:95) > at > scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:238) > at > scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62) > at > scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55) > at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49) > at scala.collection.TraversableLike.map(TraversableLike.scala:238) > at scala.collection.TraversableLike.map$(TraversableLike.scala:231) > at scala.collection.AbstractTraversable.map(Traversable.scala:108) > at > org.apache.spark.util.HadoopFSUtils$.parallelListLeafFilesInternal(HadoopFSUtils.scala:85) > at > org.apache.spark.util.HadoopFSUtils$.parallelListLeafFiles(HadoopFSUtils.scala:69) > at > org.apache.spark.sql.execution.datasources.InMemoryFileIndex$.bulkListLeafFiles(InMemoryFileIndex.scala:158) > at > org.apache.spark.sql.execution.datasources.InMemoryFileIndex.listLeafFiles(InMemoryFileIndex.scala:131) > at > org.apache.spark.sql.execution.datasources.InMemoryFileIndex.refresh0(InMemoryFileIndex.scala:94) > at > org.apache.spark.sql.execution.datasources.InMemoryFileIndex.<init>(InMemoryFileIndex.scala:66) > at > org.apache.spark.sql.execution.datasources.DataSource.createInMemoryFileIndex(DataSource.scala:581) > at > org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:417) > at > org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:325) > at > org.apache.spark.sql.DataFrameReader.$anonfun$load$3(DataFrameReader.scala:307) > at scala.Option.getOrElse(Option.scala:189) > at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:307) > at > org.apache.spark.sql.DataFrameReader.parquet(DataFrameReader.scala:833) > at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) > at > sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) > at > 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:498) > at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) > at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) > at py4j.Gateway.invoke(Gateway.java:282) > at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) > at py4j.commands.CallCommand.execute(CallCommand.java:79) > at py4j.GatewayConnection.run(GatewayConnection.java:238) > at java.lang.Thread.run(Thread.java:748){noformat} > There used to be data in the partition but after an write overwrite > > {noformat} > % mc ls -r databcp/test/s32/singleday_parts_simple2/Part=TESTING_1/ > % mc ls -r --versions databcp/test/s32/singleday_parts_simple2/Part=TESTING_1/ > [2022-05-24 08:38:13 EDT] 0B STANDARD > 89ca1b4d-492c-43fd-9f20-ac6ba5a7beaa v2 DEL > part-00000-039c3462-a522-4f6c-8614-7fe2cdedc3af.c000.snappy.parquet > [2022-05-24 08:37:27 EDT] 81MiB STANDARD > f9994b02-ee92-4518-aa73-db0c2c6b6d7f v1 PUT > part-00000-039c3462-a522-4f6c-8614-7fe2cdedc3af.c000.snappy.parquet > [2022-05-20 11:12:35 EDT] 0B STANDARD > 0bce060a-4293-4936-957f-2b139063e9d7 v2 DEL > part-00000-355353a2-638a-4004-a6b2-e81b8e9f8960.c000.snappy.parquet > [2022-05-20 11:11:50 EDT] 81MiB STANDARD > 8f6b1310-4709-4312-be5d-0805f215b868 v1 PUT > part-00000-355353a2-638a-4004-a6b2-e81b8e9f8960.c000.snappy.parquet > [2022-05-20 11:07:03 EDT] 0B STANDARD > b0418804-fcff-4edb-8bd0-afa04522d066 v2 DEL > part-00000-7f486fe0-ec9e-451e-9b26-68d9dd253c2f.c000.snappy.parquet > [2022-05-20 11:05:38 EDT] 81MiB STANDARD > 9acaa83f-1842-4e9f-b56a-a65a46d7eac4 v1 PUT > part-00000-7f486fe0-ec9e-451e-9b26-68d9dd253c2f.c000.snappy.parquet > [2022-05-20 11:21:04 EDT] 0B STANDARD > 54fb3801-8a59-4efd-9818-fc1403f4b5fd v2 DEL > part-00000-89b521eb-447d-4354-90a5-2750157763ea.c000.snappy.parquet > [2022-05-20 11:15:27 EDT] 81MiB STANDARD > c4bae059-6e32-47ed-b461-7f1cd9d78a71 v1 PUT > part-00000-89b521eb-447d-4354-90a5-2750157763ea.c000.snappy.parquet > [2022-05-24 08:38:13 EDT] 0B STANDARD > 5bf22dba-9e37-4036-a94e-6346ca65c86c v2 DEL > part-00001-039c3462-a522-4f6c-8614-7fe2cdedc3af.c000.snappy.parquet > [2022-05-24 08:37:27 EDT] 81MiB STANDARD > 82ce73b4-0716-4d7b-9301-5b248aa67063 v1 PUT > part-00001-039c3462-a522-4f6c-8614-7fe2cdedc3af.c000.snappy.parquet > [2022-05-20 11:12:35 EDT] 0B STANDARD > a357d608-edfb-4f2c-b753-6dc85556b427 v2 DEL > part-00001-355353a2-638a-4004-a6b2-e81b8e9f8960.c000.snappy.parquet > [2022-05-20 11:11:50 EDT] 81MiB STANDARD > 4817edc0-d35b-4a30-9ec2-ad70c8bcd352 v1 PUT > part-00001-355353a2-638a-4004-a6b2-e81b8e9f8960.c000.snappy.parquet > [2022-05-20 11:07:03 EDT] 0B STANDARD > daabd5ae-630f-45b7-bdf9-0ec1cd876157 v2 DEL > part-00001-7f486fe0-ec9e-451e-9b26-68d9dd253c2f.c000.snappy.parquet > [2022-05-20 11:05:38 EDT] 81MiB STANDARD > 63a33b13-3001-4b42-bc54-c648756e9543 v1 PUT > part-00001-7f486fe0-ec9e-451e-9b26-68d9dd253c2f.c000.snappy.parquet > [2022-05-20 11:21:04 EDT] 0B STANDARD > 55a8213d-b0b5-4724-b53f-4ccf71d15bb7 v2 DEL > part-00001-89b521eb-447d-4354-90a5-2750157763ea.c000.snappy.parquet > [2022-05-20 11:15:27 EDT] 81MiB STANDARD > 1cada789-4e16-4241-bbf0-22db5507657f v1 PUT > part-00001-89b521eb-447d-4354-90a5-2750157763ea.c000.snappy.parquet > [2022-05-24 08:37:26 EDT] 0B STANDARD > 16568a5a-544c-4622-83a3-d46dc871defd v2 DEL > part-00002-3afd5aa6-dd6c-4077-8e41-19354e26bd59.c000.snappy.parquet > [2022-05-20 11:21:13 EDT] 81MiB STANDARD > 
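> A boto3 sketch of that check (the endpoint and client setup are assumptions for the MinIO/S3 service backing the bucket; only the bucket {{test}} and the prefix come from the paths above):
> {code:python}
> import boto3
>
> # Endpoint is a placeholder for the object store behind s3a://test/.
> s3 = boto3.client("s3", endpoint_url="http://minio.example.com:9000")
>
> prefix = "s32/singleday_parts_simple2/Part=TESTING_1/"
>
> # A plain LIST sees nothing: each key's latest version is a delete marker.
> plain = s3.list_objects_v2(Bucket="test", Prefix=prefix)
> print(plain["KeyCount"])  # 0
>
> # A versioned LIST still returns the delete markers and the old data
> # versions, matching the `mc ls --versions` output below.
> versions = s3.list_object_versions(Bucket="test", Prefix=prefix)
> for dm in versions.get("DeleteMarkers", []):
>     print("DEL", dm["Key"], dm["VersionId"])
> for v in versions.get("Versions", []):
>     print("PUT", v["Key"], v["VersionId"])
> {code}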
> {noformat}
> % mc ls -r databcp/test/s32/singleday_parts_simple2/Part=TESTING_1/
> % mc ls -r --versions databcp/test/s32/singleday_parts_simple2/Part=TESTING_1/
> [2022-05-24 08:38:13 EDT] 0B STANDARD 89ca1b4d-492c-43fd-9f20-ac6ba5a7beaa v2 DEL part-00000-039c3462-a522-4f6c-8614-7fe2cdedc3af.c000.snappy.parquet
> [2022-05-24 08:37:27 EDT] 81MiB STANDARD f9994b02-ee92-4518-aa73-db0c2c6b6d7f v1 PUT part-00000-039c3462-a522-4f6c-8614-7fe2cdedc3af.c000.snappy.parquet
> [2022-05-20 11:12:35 EDT] 0B STANDARD 0bce060a-4293-4936-957f-2b139063e9d7 v2 DEL part-00000-355353a2-638a-4004-a6b2-e81b8e9f8960.c000.snappy.parquet
> [2022-05-20 11:11:50 EDT] 81MiB STANDARD 8f6b1310-4709-4312-be5d-0805f215b868 v1 PUT part-00000-355353a2-638a-4004-a6b2-e81b8e9f8960.c000.snappy.parquet
> [2022-05-20 11:07:03 EDT] 0B STANDARD b0418804-fcff-4edb-8bd0-afa04522d066 v2 DEL part-00000-7f486fe0-ec9e-451e-9b26-68d9dd253c2f.c000.snappy.parquet
> [2022-05-20 11:05:38 EDT] 81MiB STANDARD 9acaa83f-1842-4e9f-b56a-a65a46d7eac4 v1 PUT part-00000-7f486fe0-ec9e-451e-9b26-68d9dd253c2f.c000.snappy.parquet
> [2022-05-20 11:21:04 EDT] 0B STANDARD 54fb3801-8a59-4efd-9818-fc1403f4b5fd v2 DEL part-00000-89b521eb-447d-4354-90a5-2750157763ea.c000.snappy.parquet
> [2022-05-20 11:15:27 EDT] 81MiB STANDARD c4bae059-6e32-47ed-b461-7f1cd9d78a71 v1 PUT part-00000-89b521eb-447d-4354-90a5-2750157763ea.c000.snappy.parquet
> [2022-05-24 08:38:13 EDT] 0B STANDARD 5bf22dba-9e37-4036-a94e-6346ca65c86c v2 DEL part-00001-039c3462-a522-4f6c-8614-7fe2cdedc3af.c000.snappy.parquet
> [2022-05-24 08:37:27 EDT] 81MiB STANDARD 82ce73b4-0716-4d7b-9301-5b248aa67063 v1 PUT part-00001-039c3462-a522-4f6c-8614-7fe2cdedc3af.c000.snappy.parquet
> [2022-05-20 11:12:35 EDT] 0B STANDARD a357d608-edfb-4f2c-b753-6dc85556b427 v2 DEL part-00001-355353a2-638a-4004-a6b2-e81b8e9f8960.c000.snappy.parquet
> [2022-05-20 11:11:50 EDT] 81MiB STANDARD 4817edc0-d35b-4a30-9ec2-ad70c8bcd352 v1 PUT part-00001-355353a2-638a-4004-a6b2-e81b8e9f8960.c000.snappy.parquet
> [2022-05-20 11:07:03 EDT] 0B STANDARD daabd5ae-630f-45b7-bdf9-0ec1cd876157 v2 DEL part-00001-7f486fe0-ec9e-451e-9b26-68d9dd253c2f.c000.snappy.parquet
> [2022-05-20 11:05:38 EDT] 81MiB STANDARD 63a33b13-3001-4b42-bc54-c648756e9543 v1 PUT part-00001-7f486fe0-ec9e-451e-9b26-68d9dd253c2f.c000.snappy.parquet
> [2022-05-20 11:21:04 EDT] 0B STANDARD 55a8213d-b0b5-4724-b53f-4ccf71d15bb7 v2 DEL part-00001-89b521eb-447d-4354-90a5-2750157763ea.c000.snappy.parquet
> [2022-05-20 11:15:27 EDT] 81MiB STANDARD 1cada789-4e16-4241-bbf0-22db5507657f v1 PUT part-00001-89b521eb-447d-4354-90a5-2750157763ea.c000.snappy.parquet
> [2022-05-24 08:37:26 EDT] 0B STANDARD 16568a5a-544c-4622-83a3-d46dc871defd v2 DEL part-00002-3afd5aa6-dd6c-4077-8e41-19354e26bd59.c000.snappy.parquet
> [2022-05-20 11:21:13 EDT] 81MiB STANDARD a6e90fa8-e5dc-431e-a89b-30c45bea5884 v1 PUT part-00002-3afd5aa6-dd6c-4077-8e41-19354e26bd59.c000.snappy.parquet
> [2022-05-20 11:11:49 EDT] 0B STANDARD ea751b23-1b98-4ff5-8e43-b71417f8d8a8 v2 DEL part-00002-44214757-070f-43de-9853-cebdfd9b6543.c000.snappy.parquet
> [2022-05-20 11:07:13 EDT] 81MiB STANDARD 8338839d-00c8-4dc6-826f-1131f44e5ff7 v1 PUT part-00002-44214757-070f-43de-9853-cebdfd9b6543.c000.snappy.parquet
> [2022-05-20 11:15:25 EDT] 0B STANDARD cd38cdcb-177a-4f8a-9f91-f0ad3aea2815 v2 DEL part-00002-b8456920-9109-414f-880b-dbddc510f521.c000.snappy.parquet
> [2022-05-20 11:12:36 EDT] 81MiB STANDARD 7623ef59-79e1-4df5-a2c2-20f32deb9f28 v1 PUT part-00002-b8456920-9109-414f-880b-dbddc510f521.c000.snappy.parquet
> [2022-05-24 09:19:23 EDT] 0B STANDARD de2f8f8d-f74a-4bd0-814f-fe74e53f08bc v2 DEL part-00002-c01bad8a-f8f1-44fa-999a-d17d60d20c1a.c000.snappy.parquet
> [2022-05-24 08:38:13 EDT] 81MiB STANDARD 66815d49-da22-44f6-b4dd-b8040fdfc120 v1 PUT part-00002-c01bad8a-f8f1-44fa-999a-d17d60d20c1a.c000.snappy.parquet
> [2022-05-24 08:37:26 EDT] 0B STANDARD 48993698-d03f-4f60-af6e-a4923d8c09c6 v2 DEL part-00003-3afd5aa6-dd6c-4077-8e41-19354e26bd59.c000.snappy.parquet
> [2022-05-20 11:21:29 EDT] 81MiB STANDARD f2bfbe57-1241-470d-9e87-f34003fe9acb v1 PUT part-00003-3afd5aa6-dd6c-4077-8e41-19354e26bd59.c000.snappy.parquet
> [2022-05-20 11:11:49 EDT] 0B STANDARD 1d695c38-fc42-4cfa-97f0-5ad2d64256da v2 DEL part-00003-44214757-070f-43de-9853-cebdfd9b6543.c000.snappy.parquet
> [2022-05-20 11:07:13 EDT] 81MiB STANDARD ab5f2f1b-78e7-409c-ab17-1f6bc2259c18 v1 PUT part-00003-44214757-070f-43de-9853-cebdfd9b6543.c000.snappy.parquet
> [2022-05-20 11:15:25 EDT] 0B STANDARD 23c0b362-fe97-431f-a4c5-fdfe2c86826b v2 DEL part-00003-b8456920-9109-414f-880b-dbddc510f521.c000.snappy.parquet
> [2022-05-20 11:12:37 EDT] 81MiB STANDARD 2773bdb9-ae7e-49df-8d49-73ee0c9a4c3c v1 PUT part-00003-b8456920-9109-414f-880b-dbddc510f521.c000.snappy.parquet
> [2022-05-24 09:19:23 EDT] 0B STANDARD fd7d5309-4be8-4d5d-a4d6-daa6ed957ccd v2 DEL part-00003-c01bad8a-f8f1-44fa-999a-d17d60d20c1a.c000.snappy.parquet
> [2022-05-24 08:38:14 EDT] 81MiB STANDARD 2646ddc1-a32c-41e1-89fa-16805cbdb0f7 v1 PUT part-00003-c01bad8a-f8f1-44fa-999a-d17d60d20c1a.c000.snappy.parquet
> {noformat}

--
This message was sent by Atlassian Jira
(v8.20.7#820007)

---------------------------------------------------------------------
To unsubscribe, e-mail: common-dev-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-dev-h...@hadoop.apache.org