Repository: hadoop Updated Branches: refs/heads/HADOOP-13345 8e257a406 -> 2d0684292
HADOOP-13760: S3Guard: add new classes Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/2d068429 Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/2d068429 Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/2d068429 Branch: refs/heads/HADOOP-13345 Commit: 2d0684292bb8cd77509e71830ee047e788057a05 Parents: 8e257a4 Author: Sean Mackrory <mackror...@apache.org> Authored: Thu May 25 07:11:20 2017 -0600 Committer: Sean Mackrory <mackror...@apache.org> Committed: Thu May 25 07:11:20 2017 -0600 ---------------------------------------------------------------------- .../s3guard/MetadataStoreListFilesIterator.java | 168 +++++++++++++++++++ .../org/apache/hadoop/fs/s3a/TestListing.java | 94 +++++++++++ 2 files changed, 262 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hadoop/blob/2d068429/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/MetadataStoreListFilesIterator.java ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/MetadataStoreListFilesIterator.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/MetadataStoreListFilesIterator.java new file mode 100644 index 0000000..272b1f4 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/MetadataStoreListFilesIterator.java @@ -0,0 +1,168 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.fs.s3a.s3guard;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Queue;
import java.util.Set;

import com.google.common.base.Preconditions;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * {@code MetadataStoreListFilesIterator} is a {@link RemoteIterator} that
 * is similar to {@code DescendantsIterator} but does not return directories
 * that have (or may have) children, and will also provide access to the set of
 * tombstones to allow recently deleted S3 objects to be filtered out from a
 * corresponding request.  In other words, it returns tombstones and the same
 * set of objects that should exist in S3: empty directories, and files, and not
 * other directories whose existence is inferred therefrom.
 *
 * The whole tree is walked breadth-first when the iterator is constructed;
 * {@link #hasNext()} and {@link #next()} only replay the prefetched result.
 *
 * For example, assume the consistent store contains metadata representing this
 * file system structure:
 *
 * <pre>
 * /dir1
 * |-- dir2
 * |   |-- file1
 * |   `-- file2
 * `-- dir3
 *     |-- dir4
 *     |   `-- file3
 *     |-- dir5
 *     |   `-- file4
 *     `-- dir6
 * </pre>
 *
 * Consider this code sample, where {@code ms} is the {@code MetadataStore}
 * holding that metadata:
 * <pre>
 * final PathMetadata dir1 = get(new Path("/dir1"));
 * for (MetadataStoreListFilesIterator files =
 *     new MetadataStoreListFilesIterator(ms, dir1, true);
 *     files.hasNext(); ) {
 *   final FileStatus status = files.next();
 *   System.out.printf("%s %s%n", status.isDirectory() ? 'D' : 'F',
 *       status.getPath());
 * }
 * </pre>
 *
 * Assuming the store lists children in the order drawn above, and that the
 * listing of {@code /dir1/dir3/dir6} is authoritative, the output is:
 * <pre>
 * F /dir1/dir2/file1
 * F /dir1/dir2/file2
 * D /dir1/dir3/dir6
 * F /dir1/dir3/dir4/file3
 * F /dir1/dir3/dir5/file4
 * </pre>
 *
 * If that listing is not authoritative, or {@code allowAuthoritative} is
 * {@code false}, the {@code dir6} entry is omitted.
 */
@InterfaceAudience.Private
@InterfaceStability.Evolving
public class MetadataStoreListFilesIterator implements
    RemoteIterator<FileStatus> {
  public static final Logger LOG = LoggerFactory.getLogger(
      MetadataStoreListFilesIterator.class);

  private final boolean allowAuthoritative;
  private final MetadataStore metadataStore;
  private final Set<Path> tombstones = new HashSet<>();
  private final Iterator<FileStatus> leafNodesIterator;

  /**
   * Builds the iterator and eagerly walks the metadata below {@code meta}.
   *
   * @param ms metadata store to query; must not be null
   * @param meta entry to start from; when null the iterator is empty
   * @param allowAuthoritative whether a directory with no live children may
   *     be reported as an empty directory when its listing is authoritative
   * @throws IOException propagated from the metadata store while listing
   */
  public MetadataStoreListFilesIterator(MetadataStore ms, PathMetadata meta,
      boolean allowAuthoritative) throws IOException {
    Preconditions.checkNotNull(ms);
    this.metadataStore = ms;
    this.allowAuthoritative = allowAuthoritative;
    // Both fields read by prefetch() are assigned above.
    this.leafNodesIterator = prefetch(meta).iterator();
  }

  /**
   * Walks the tree rooted at {@code meta} breadth-first, collecting every
   * file and every provably empty directory, and recording all tombstones
   * seen along the way.
   *
   * @param meta entry to start from, or null for an empty result
   * @return the leaf entries in the order they were discovered
   * @throws IOException propagated from the metadata store while listing
   */
  private Collection<FileStatus> prefetch(PathMetadata meta)
      throws IOException {
    final Queue<PathMetadata> queue = new LinkedList<>();
    final Collection<FileStatus> leafNodes = new ArrayList<>();

    if (meta != null) {
      final Path path = meta.getFileStatus().getPath();
      if (path.isRoot()) {
        // The root itself is never reported; only its children are seeded.
        DirListingMetadata rootListing = metadataStore.listChildren(path);
        if (rootListing != null) {
          queue.addAll(liveChildren(rootListing));
        }
      } else {
        queue.add(meta);
      }
    }

    while (!queue.isEmpty()) {
      final FileStatus nextStatus = queue.poll().getFileStatus();
      if (nextStatus.isFile()) {
        // All files are leaf nodes by definition
        leafNodes.add(nextStatus);
      } else if (nextStatus.isDirectory()) {
        DirListingMetadata children =
            metadataStore.listChildren(nextStatus.getPath());
        if (children == null) {
          continue;
        }
        Collection<PathMetadata> live = liveChildren(children);
        if (!live.isEmpty()) {
          // If it's a directory, has children, not all deleted, then we
          // add the children to the queue and move on to the next node
          queue.addAll(live);
        } else if (allowAuthoritative && children.isAuthoritative()) {
          leafNodes.add(nextStatus);
        }
      }
      // Directories that *might* be empty are ignored for now, since we
      // cannot confirm that they are empty without incurring other costs.
      // Users of this class can still discover empty directories via S3's
      // fake directories, subject to the same consistency semantics as before.
      // The only other possibility is a symlink, which is unsupported on S3A.
    }
    return leafNodes;
  }

  /**
   * Records the tombstones of a listing and returns its remaining entries.
   *
   * @param listing non-null directory listing from the metadata store
   * @return the entries of {@code listing} that are not tombstones
   */
  private Collection<PathMetadata> liveChildren(DirListingMetadata listing) {
    tombstones.addAll(listing.listTombstones());
    return listing.withoutTombstones().getListing();
  }

  /**
   * @return true if at least one prefetched entry has not been returned yet
   */
  @Override
  public boolean hasNext() {
    return leafNodesIterator.hasNext();
  }

  /**
   * @return the next prefetched file or empty-directory status
   */
  @Override
  public FileStatus next() {
    return leafNodesIterator.next();
  }

  /**
   * Returns the tombstoned paths recorded while prefetching.  This is the
   * iterator's own set, not a copy.
   *
   * @return paths of all tombstones encountered during the walk
   */
  public Set<Path> listTombstones() {
    return tombstones;
  }
}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/2d068429/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestListing.java ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestListing.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestListing.java new file mode 100644 index 0000000..43eb2c0 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestListing.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.s3a; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; +import org.junit.Assert; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; + +/** + * Place for the S3A listing classes; keeps all the small classes under control. + */ +public class TestListing extends AbstractS3AMockTest { + + private static class MockRemoteIterator<FileStatus> implements + RemoteIterator<FileStatus> { + private Iterator<FileStatus> iterator; + + MockRemoteIterator(Collection<FileStatus> source) { + iterator = source.iterator(); + } + + public boolean hasNext() { + return iterator.hasNext(); + } + + public FileStatus next() { + return iterator.next(); + } + } + + private FileStatus blankFileStatus(Path path) { + return new FileStatus(0, true, 0, 0, 0, path); + } + + @Test + public void testTombstoneReconcilingIterator() throws Exception { + Path parent = new Path("/parent"); + Path liveChild = new Path(parent, "/liveChild"); + Path deletedChild = new Path(parent, "/deletedChild"); + Path[] allFiles = {parent, liveChild, deletedChild}; + Path[] liveFiles = {parent, liveChild}; + + Listing listing = new Listing(fs); + Collection<FileStatus> statuses = new ArrayList<>(); + statuses.add(blankFileStatus(parent)); + statuses.add(blankFileStatus(liveChild)); + statuses.add(blankFileStatus(deletedChild)); + + Set<Path> tombstones = new HashSet<>(); + tombstones.add(deletedChild); + + RemoteIterator<FileStatus> sourceIterator = new MockRemoteIterator( + statuses); + RemoteIterator<LocatedFileStatus> locatedIterator = + listing.createLocatedFileStatusIterator(sourceIterator); + RemoteIterator<LocatedFileStatus> reconcilingIterator = + listing.createTombstoneReconcilingIterator(locatedIterator, tombstones); + + Set<Path> expectedPaths = 
new HashSet<>(); + expectedPaths.add(parent); + expectedPaths.add(liveChild); + + Set<Path> actualPaths = new HashSet<>(); + while (reconcilingIterator.hasNext()) { + actualPaths.add(reconcilingIterator.next().getPath()); + } + Assert.assertTrue(actualPaths.equals(expectedPaths)); + } +} --------------------------------------------------------------------- To unsubscribe, e-mail: common-commits-unsubscr...@hadoop.apache.org For additional commands, e-mail: common-commits-h...@hadoop.apache.org