[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-10 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r322910443
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsckViolationHandler.java
 ##
 @@ -0,0 +1,312 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.lang.reflect.InvocationTargetException;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Violation handler for the S3Guard's fsck.
+ */
+public class S3GuardFsckViolationHandler {
+  private static final Logger LOG = LoggerFactory.getLogger(
+  S3GuardFsckViolationHandler.class);
+
+  private S3AFileSystem rawFs;
+  private DynamoDBMetadataStore metadataStore;
+  private static String newLine = System.getProperty("line.separator");
+
+  public S3GuardFsckViolationHandler(S3AFileSystem fs,
+  DynamoDBMetadataStore ddbms) {
+
+this.metadataStore = ddbms;
+this.rawFs = fs;
+  }
+
+  public void handle(S3GuardFsck.ComparePair comparePair) {
+if (!comparePair.containsViolation()) {
+  LOG.debug("There is no violation in the compare pair: " + toString());
+  return;
+}
+
+StringBuilder sB = new StringBuilder();
+sB.append(newLine)
+.append("On path: ").append(comparePair.getPath()).append(newLine);
+
+// Create a new instance of the handler and use it.
+for (S3GuardFsck.Violation violation : comparePair.getViolations()) {
+  try {
+ViolationHandler handler = violation.getHandler()
+.getDeclaredConstructor(S3GuardFsck.ComparePair.class)
+.newInstance(comparePair);
+final String errorStr = handler.getError();
+sB.append(errorStr);
+  } catch (NoSuchMethodException e) {
+LOG.error("Can not find declared constructor for handler: {}",
+violation.getHandler());
+  } catch (IllegalAccessException | InstantiationException | 
InvocationTargetException e) {
+LOG.error("Can not instantiate handler: {}",
+violation.getHandler());
+  }
+  sB.append(newLine);
+}
+LOG.error(sB.toString());
+  }
+
+  /**
+   * Violation handler abstract class.
+   * This class should be extended for violation handlers.
+   */
+  public static abstract class ViolationHandler {
+private final PathMetadata pathMetadata;
+private final S3AFileStatus s3FileStatus;
+private final S3AFileStatus msFileStatus;
+private final List s3DirListing;
+private final DirListingMetadata msDirListing;
+
+public ViolationHandler(S3GuardFsck.ComparePair comparePair) {
+  pathMetadata = comparePair.getMsPathMetadata();
+  s3FileStatus = comparePair.getS3FileStatus();
+  if (pathMetadata != null) {
+msFileStatus = pathMetadata.getFileStatus();
+  } else {
+msFileStatus = null;
+  }
+  s3DirListing = comparePair.getS3DirListing();
+  msDirListing = comparePair.getMsDirListing();
+}
+
+abstract String getError();
+
+public PathMetadata getPathMetadata() {
+  return pathMetadata;
+}
+
+public S3AFileStatus getS3FileStatus() {
+  return s3FileStatus;
+}
+
+public S3AFileStatus getMsFileStatus() {
+  return msFileStatus;
+}
+
+public List getS3DirListing() {
+  return s3DirListing;
+}
+
+public DirListingMetadata getMsDirListing() {
+  return msDirListing;
+}
+  }
+
+  /**
+   * The violation handler when there's no matching metadata entry in the MS.
+   */
+  public static class NoMetadataEntry extends ViolationHandler {
+
+public NoMetadataEntry(S3GuardFsck.ComparePair comparePair) {
+  super(comparePair);
+}
+
+@Override
+public String getError() {
+  return "No PathMetadata for this path in the MS.";
+}
+  }
+
+  /**
+   * The violation handler when there's no parent entry.
+   */
+  public s

[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-10 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r322909779
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsckViolationHandler.java
 ##
 @@ -0,0 +1,312 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.lang.reflect.InvocationTargetException;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Violation handler for the S3Guard's fsck.
+ */
+public class S3GuardFsckViolationHandler {
+  private static final Logger LOG = LoggerFactory.getLogger(
+  S3GuardFsckViolationHandler.class);
+
+  private S3AFileSystem rawFs;
+  private DynamoDBMetadataStore metadataStore;
+  private static String newLine = System.getProperty("line.separator");
+
+  public S3GuardFsckViolationHandler(S3AFileSystem fs,
+  DynamoDBMetadataStore ddbms) {
+
+this.metadataStore = ddbms;
+this.rawFs = fs;
+  }
+
+  public void handle(S3GuardFsck.ComparePair comparePair) {
+if (!comparePair.containsViolation()) {
+  LOG.debug("There is no violation in the compare pair: " + toString());
+  return;
+}
+
+StringBuilder sB = new StringBuilder();
+sB.append(newLine)
+.append("On path: ").append(comparePair.getPath()).append(newLine);
+
+// Create a new instance of the handler and use it.
+for (S3GuardFsck.Violation violation : comparePair.getViolations()) {
+  try {
+ViolationHandler handler = violation.getHandler()
+.getDeclaredConstructor(S3GuardFsck.ComparePair.class)
+.newInstance(comparePair);
+final String errorStr = handler.getError();
+sB.append(errorStr);
+  } catch (NoSuchMethodException e) {
+LOG.error("Can not find declared constructor for handler: {}",
+violation.getHandler());
+  } catch (IllegalAccessException | InstantiationException | 
InvocationTargetException e) {
+LOG.error("Can not instantiate handler: {}",
+violation.getHandler());
+  }
+  sB.append(newLine);
+}
+LOG.error(sB.toString());
+  }
+
+  /**
+   * Violation handler abstract class.
+   * This class should be extended for violation handlers.
+   */
+  public static abstract class ViolationHandler {
+private final PathMetadata pathMetadata;
+private final S3AFileStatus s3FileStatus;
+private final S3AFileStatus msFileStatus;
+private final List s3DirListing;
+private final DirListingMetadata msDirListing;
+
+public ViolationHandler(S3GuardFsck.ComparePair comparePair) {
+  pathMetadata = comparePair.getMsPathMetadata();
+  s3FileStatus = comparePair.getS3FileStatus();
+  if (pathMetadata != null) {
+msFileStatus = pathMetadata.getFileStatus();
+  } else {
+msFileStatus = null;
+  }
+  s3DirListing = comparePair.getS3DirListing();
+  msDirListing = comparePair.getMsDirListing();
+}
+
+abstract String getError();
+
+public PathMetadata getPathMetadata() {
+  return pathMetadata;
+}
+
+public S3AFileStatus getS3FileStatus() {
+  return s3FileStatus;
+}
+
+public S3AFileStatus getMsFileStatus() {
+  return msFileStatus;
+}
+
+public List getS3DirListing() {
+  return s3DirListing;
+}
+
+public DirListingMetadata getMsDirListing() {
+  return msDirListing;
+}
+  }
+
+  /**
+   * The violation handler when there's no matching metadata entry in the MS.
+   */
+  public static class NoMetadataEntry extends ViolationHandler {
+
+public NoMetadataEntry(S3GuardFsck.ComparePair comparePair) {
+  super(comparePair);
+}
+
+@Override
+public String getError() {
+  return "No PathMetadata for this path in the MS.";
+}
+  }
+
+  /**
+   * The violation handler when there's no parent entry.
+   */
+  public s

[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-10 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r322909421
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java
 ##
 @@ -1449,7 +1449,7 @@ public boolean hasMetadataStore() {
* is set for this filesystem.
*/
   @VisibleForTesting
-  boolean hasAuthoritativeMetadataStore() {
+  public boolean hasAuthoritativeMetadataStore() {
 
 Review comment:
   We're only using this for tests, so I'm not as worried as I was. +1 for this 
change.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-issues-h...@hadoop.apache.org



[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-10 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r322799509
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java
 ##
 @@ -1449,7 +1449,7 @@ public boolean hasMetadataStore() {
* is set for this filesystem.
*/
   @VisibleForTesting
-  boolean hasAuthoritativeMetadataStore() {
+  public boolean hasAuthoritativeMetadataStore() {
 
 Review comment:
   If someone has configured the test path to be authoritative, then even if 
the store says it is non-authoritative you'll get authoritative-store 
behaviour. Best to either unset the per-bucket/global authoritative mode 
settings in test config creation or check the authoritative mode of the actual 
test path.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-issues-h...@hadoop.apache.org



[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-10 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r322797186
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardFsck.java
 ##
 @@ -23,25 +23,26 @@
 import java.util.List;
 import java.util.UUID;
 
-import org.apache.hadoop.io.IOUtils;
+import org.assertj.core.api.Assertions;
 import org.junit.Before;
 import org.junit.Test;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.io.IOUtils;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.s3a.AbstractS3ATestBase;
 import org.apache.hadoop.fs.s3a.S3AFileStatus;
 import org.apache.hadoop.fs.s3a.S3AFileSystem;
-import org.assertj.core.api.Assertions;
 
+import static org.apache.hadoop.fs.s3a.Constants.AUTHORITATIVE_PATH;
+import static org.junit.Assume.assumeTrue;
 
 Review comment:
   Not sure why these imports were moved around.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-issues-h...@hadoop.apache.org



[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-10 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r322796262
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsck.java
 ##
 @@ -89,50 +89,53 @@
* The violations are listed in Enums: {@link Violation}
*
* @param p the root path to start the traversal
-   * @throws IOException
* @return a list of {@link ComparePair}
+   * @throws IOException
*/
   public List compareS3ToMs(Path p) throws IOException {
 Stopwatch stopwatch = Stopwatch.createStarted();
 int scannedItems = 0;
 
 final Path rootPath = rawFS.qualify(p);
-S3AFileStatus root = null;
-try {
-  root = (S3AFileStatus) rawFS.getFileStatus(rootPath);
-} catch (AWSBadRequestException e) {
-  throw new IOException(e.getMessage());
-}
+S3AFileStatus root = (S3AFileStatus) rawFS.getFileStatus(rootPath);
 final List comparePairs = new ArrayList<>();
 final Queue queue = new ArrayDeque<>();
 queue.add(root);
 
 while (!queue.isEmpty()) {
   final S3AFileStatus currentDir = queue.poll();
-  scannedItems++;
+
 
   final Path currentDirPath = currentDir.getPath();
-  List s3DirListing = 
Arrays.asList(rawFS.listStatus(currentDirPath));
-
-  // DIRECTORIES
-  // Check directory authoritativeness consistency
-  compareAuthoritativeDirectoryFlag(comparePairs, currentDirPath, 
s3DirListing);
-  // Add all descendant directory to the queue
-  s3DirListing.stream().filter(pm -> pm.isDirectory())
-  .map(S3AFileStatus.class::cast)
-  .forEach(pm -> queue.add(pm));
-
-  // FILES
-  // check files for consistency
-  final List children = s3DirListing.stream()
-  .filter(status -> !status.isDirectory())
-  .map(S3AFileStatus.class::cast).collect(toList());
-  final List compareResult =
-  compareS3DirToMs(currentDir, children).stream()
-  .filter(comparePair -> comparePair.containsViolation())
-  .collect(toList());
-  comparePairs.addAll(compareResult);
-  scannedItems += children.size();
+  try {
+List s3DirListing = Arrays.asList(
+rawFS.listStatus(currentDirPath));
+
+// Check authoritative directory flag.
+compareAuthoritativeDirectoryFlag(comparePairs, currentDirPath,
+s3DirListing);
+// Add all descendant directory to the queue
+s3DirListing.stream().filter(pm -> pm.isDirectory())
+.map(S3AFileStatus.class::cast)
+.forEach(pm -> queue.add(pm));
+
+// Check file and directory metadata for consistency.
+final List children = s3DirListing.stream()
+.filter(status -> !status.isDirectory())
+.map(S3AFileStatus.class::cast).collect(toList());
+final List compareResult =
+compareS3DirContentToMs(currentDir, children);
+comparePairs.addAll(compareResult);
+
+// Increase the scanned file size.
+// One for the directory, one for the children.
+scannedItems++;
+scannedItems += children.size();
+  } catch (FileNotFoundException e) {
+LOG.error("The path has been deleted since it was queued: "
 
 Review comment:
   Should this be logged at error or warn? Normally I'd go for an SLF4J {} 
placeholder, but as this is intended to always be logged, it'll do as is.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-issues-h...@hadoop.apache.org



[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r321800994
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsck.java
 ##
 @@ -0,0 +1,421 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.s3a.AWSBadRequestException;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+
+import com.google.common.base.Stopwatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.security.InvalidParameterException;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Queue;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+import static java.util.stream.Collectors.toList;
+import static java.util.stream.Collectors.toSet;
+
+/**
+ * Main class for the FSCK factored out from S3GuardTool
+ * The implementation uses fixed DynamoDBMetadataStore as the backing store
+ * for metadata.
+ *
+ * Functions:
+ * 
+ *   Checking metadata consistency between S3 and metadatastore
+ * 
+ */
+public class S3GuardFsck {
+  private static final Logger LOG = LoggerFactory.getLogger(S3GuardFsck.class);
+  public static final String ROOT_PATH_STRING = "/";
+
+  private S3AFileSystem rawFS;
+  private DynamoDBMetadataStore metadataStore;
+
+  /**
+   * Creates an S3GuardFsck.
+   * @param fs the filesystem to compare to
+   * @param ms metadatastore the metadatastore to compare with (dynamo)
+   */
+  S3GuardFsck(S3AFileSystem fs, MetadataStore ms)
+  throws InvalidParameterException {
+this.rawFS = fs;
+
+if (ms == null) {
+  throw new InvalidParameterException("S3AFileSystem should be guarded by"
+  + " a " + DynamoDBMetadataStore.class.getCanonicalName());
+}
+this.metadataStore = (DynamoDBMetadataStore) ms;
+
+if (rawFS.hasMetadataStore()) {
+  throw new InvalidParameterException("Raw fs should not have a "
+  + "metadatastore.");
+}
+  }
+
+  /**
+   * Compares S3 to MS.
+   * Iterative breadth first walk on the S3 structure from a given root.
+   * Creates a list of pairs (metadata in S3 and in the MetadataStore) where
+   * the consistency or any rule is violated.
+   * Uses {@link S3GuardFsckViolationHandler} to handle violations.
+   * The violations are listed in Enums: {@link Violation}
+   *
+   * @param p the root path to start the traversal
+   * @throws IOException
+   * @return a list of {@link ComparePair}
+   */
+  public List compareS3ToMs(Path p) throws IOException {
+Stopwatch stopwatch = Stopwatch.createStarted();
+int scannedItems = 0;
+
+final Path rootPath = rawFS.qualify(p);
+S3AFileStatus root = null;
+try {
+  root = (S3AFileStatus) rawFS.getFileStatus(rootPath);
+} catch (AWSBadRequestException e) {
+  throw new IOException(e.getMessage());
+}
+final List comparePairs = new ArrayList<>();
+final Queue queue = new ArrayDeque<>();
+queue.add(root);
+
+while (!queue.isEmpty()) {
+  final S3AFileStatus currentDir = queue.poll();
+  scannedItems++;
+
+  final Path currentDirPath = currentDir.getPath();
+  List s3DirListing = 
Arrays.asList(rawFS.listStatus(currentDirPath));
+
+  // DIRECTORIES
+  // Check directory authoritativeness consistency
+  compareAuthoritativeDirectoryFlag(comparePairs, currentDirPath, 
s3DirListing);
+  // Add all descendant directory to the queue
+  s3DirListing.stream().filter(pm -> pm.isDirectory())
+  .map(S3AFileStatus.class::cast)
+  .forEach(pm -> queue.add(pm));
+
+  // FILES
+  // check files for consistency
+  final List children = s3DirListing.stream()
+  .filter(status -> !status.isDirectory())
+  .map(S3AFileStatus.class::cast).collect(toList());
+   

[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r321802607
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsck.java
 ##
 @@ -0,0 +1,421 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.s3a.AWSBadRequestException;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+
+import com.google.common.base.Stopwatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.security.InvalidParameterException;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Queue;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+import static java.util.stream.Collectors.toList;
+import static java.util.stream.Collectors.toSet;
+
+/**
+ * Main class for the FSCK factored out from S3GuardTool
+ * The implementation uses fixed DynamoDBMetadataStore as the backing store
+ * for metadata.
+ *
+ * Functions:
+ * 
+ *   Checking metadata consistency between S3 and metadatastore
+ * 
+ */
+public class S3GuardFsck {
+  private static final Logger LOG = LoggerFactory.getLogger(S3GuardFsck.class);
+  public static final String ROOT_PATH_STRING = "/";
+
+  private S3AFileSystem rawFS;
+  private DynamoDBMetadataStore metadataStore;
+
+  /**
+   * Creates an S3GuardFsck.
+   * @param fs the filesystem to compare to
+   * @param ms metadatastore the metadatastore to compare with (dynamo)
+   */
+  S3GuardFsck(S3AFileSystem fs, MetadataStore ms)
+  throws InvalidParameterException {
+this.rawFS = fs;
+
+if (ms == null) {
+  throw new InvalidParameterException("S3AFileSystem should be guarded by"
+  + " a " + DynamoDBMetadataStore.class.getCanonicalName());
+}
+this.metadataStore = (DynamoDBMetadataStore) ms;
+
+if (rawFS.hasMetadataStore()) {
+  throw new InvalidParameterException("Raw fs should not have a "
+  + "metadatastore.");
+}
+  }
+
+  /**
+   * Compares S3 to MS.
+   * Iterative breadth first walk on the S3 structure from a given root.
+   * Creates a list of pairs (metadata in S3 and in the MetadataStore) where
+   * the consistency or any rule is violated.
+   * Uses {@link S3GuardFsckViolationHandler} to handle violations.
+   * The violations are listed in Enums: {@link Violation}
+   *
+   * @param p the root path to start the traversal
+   * @throws IOException
+   * @return a list of {@link ComparePair}
+   */
+  public List compareS3ToMs(Path p) throws IOException {
+Stopwatch stopwatch = Stopwatch.createStarted();
+int scannedItems = 0;
+
+final Path rootPath = rawFS.qualify(p);
+S3AFileStatus root = null;
+try {
+  root = (S3AFileStatus) rawFS.getFileStatus(rootPath);
+} catch (AWSBadRequestException e) {
+  throw new IOException(e.getMessage());
+}
+final List comparePairs = new ArrayList<>();
+final Queue queue = new ArrayDeque<>();
+queue.add(root);
+
+while (!queue.isEmpty()) {
+  final S3AFileStatus currentDir = queue.poll();
+  scannedItems++;
+
+  final Path currentDirPath = currentDir.getPath();
+  List s3DirListing = 
Arrays.asList(rawFS.listStatus(currentDirPath));
+
+  // DIRECTORIES
+  // Check directory authoritativeness consistency
+  compareAuthoritativeDirectoryFlag(comparePairs, currentDirPath, 
s3DirListing);
+  // Add all descendant directory to the queue
+  s3DirListing.stream().filter(pm -> pm.isDirectory())
+  .map(S3AFileStatus.class::cast)
+  .forEach(pm -> queue.add(pm));
+
+  // FILES
+  // check files for consistency
+  final List children = s3DirListing.stream()
+  .filter(status -> !status.isDirectory())
+  .map(S3AFileStatus.class::cast).collect(toList());
+   

[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r321806732
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsck.java
 ##
 @@ -0,0 +1,421 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.s3a.AWSBadRequestException;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+
+import com.google.common.base.Stopwatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.security.InvalidParameterException;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Queue;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+import static java.util.stream.Collectors.toList;
+import static java.util.stream.Collectors.toSet;
+
+/**
+ * Main class for the FSCK factored out from S3GuardTool
+ * The implementation uses fixed DynamoDBMetadataStore as the backing store
+ * for metadata.
+ *
+ * Functions:
+ * 
+ *   Checking metadata consistency between S3 and metadatastore
+ * 
+ */
+public class S3GuardFsck {
+  private static final Logger LOG = LoggerFactory.getLogger(S3GuardFsck.class);
+  public static final String ROOT_PATH_STRING = "/";
+
+  private S3AFileSystem rawFS;
+  private DynamoDBMetadataStore metadataStore;
+
+  /**
+   * Creates an S3GuardFsck.
+   * @param fs the filesystem to compare to
+   * @param ms metadatastore the metadatastore to compare with (dynamo)
+   */
+  S3GuardFsck(S3AFileSystem fs, MetadataStore ms)
+  throws InvalidParameterException {
+this.rawFS = fs;
+
+if (ms == null) {
+  throw new InvalidParameterException("S3AFileSystem should be guarded by"
+  + " a " + DynamoDBMetadataStore.class.getCanonicalName());
+}
+this.metadataStore = (DynamoDBMetadataStore) ms;
+
+if (rawFS.hasMetadataStore()) {
+  throw new InvalidParameterException("Raw fs should not have a "
+  + "metadatastore.");
+}
+  }
+
+  /**
+   * Compares S3 to MS.
+   * Iterative breadth first walk on the S3 structure from a given root.
+   * Creates a list of pairs (metadata in S3 and in the MetadataStore) where
+   * the consistency or any rule is violated.
+   * Uses {@link S3GuardFsckViolationHandler} to handle violations.
+   * The violations are listed in Enums: {@link Violation}
+   *
+   * @param p the root path to start the traversal
+   * @throws IOException
+   * @return a list of {@link ComparePair}
+   */
+  public List compareS3ToMs(Path p) throws IOException {
+Stopwatch stopwatch = Stopwatch.createStarted();
+int scannedItems = 0;
+
+final Path rootPath = rawFS.qualify(p);
+S3AFileStatus root = null;
+try {
+  root = (S3AFileStatus) rawFS.getFileStatus(rootPath);
+} catch (AWSBadRequestException e) {
+  throw new IOException(e.getMessage());
+}
+final List comparePairs = new ArrayList<>();
+final Queue queue = new ArrayDeque<>();
+queue.add(root);
+
+while (!queue.isEmpty()) {
+  final S3AFileStatus currentDir = queue.poll();
+  scannedItems++;
+
+  final Path currentDirPath = currentDir.getPath();
+  List s3DirListing = 
Arrays.asList(rawFS.listStatus(currentDirPath));
+
+  // DIRECTORIES
+  // Check directory authoritativeness consistency
+  compareAuthoritativeDirectoryFlag(comparePairs, currentDirPath, 
s3DirListing);
+  // Add all descendant directory to the queue
+  s3DirListing.stream().filter(pm -> pm.isDirectory())
+  .map(S3AFileStatus.class::cast)
+  .forEach(pm -> queue.add(pm));
+
+  // FILES
+  // check files for consistency
+  final List children = s3DirListing.stream()
+  .filter(status -> !status.isDirectory())
+  .map(S3AFileStatus.class::cast).collect(toList());
+   

[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r321803184
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsckViolationHandler.java
 ##
 @@ -0,0 +1,315 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.lang.reflect.InvocationTargetException;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Violation handler for the S3Guard's fsck.
+ */
+public class S3GuardFsckViolationHandler {
+  private static final Logger LOG = LoggerFactory.getLogger(
+  S3GuardFsckViolationHandler.class);
+
+  // The rawFS and metadataStore are here to prepare when the ViolationHandlers
+  // will not just log, but fix the violations, so they will have access.
+  private S3AFileSystem rawFs;
+  private DynamoDBMetadataStore metadataStore;
+
+  private static String newLine = System.getProperty("line.separator");
+
+  public S3GuardFsckViolationHandler(S3AFileSystem fs,
+  DynamoDBMetadataStore ddbms) {
+
+this.metadataStore = ddbms;
+this.rawFs = fs;
+  }
+
+  public void handle(S3GuardFsck.ComparePair comparePair) {
+if (!comparePair.containsViolation()) {
+  LOG.debug("There is no violation in the compare pair: " + toString());
 
 Review comment:
   Use parameterized logging here, i.e. LOG.debug("...: {}", this), so that toString() is only invoked when debug-level logging is enabled.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-issues-h...@hadoop.apache.org



[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r321812144
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardFsck.java
 ##
 @@ -0,0 +1,707 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+
+import java.net.URI;
+import java.util.List;
+import java.util.UUID;
+
+import org.apache.hadoop.io.IOUtils;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.s3a.AbstractS3ATestBase;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import org.assertj.core.api.Assertions;
+
+import static org.apache.hadoop.fs.contract.ContractTestUtils.touch;
+import static org.apache.hadoop.fs.s3a.Constants.METADATASTORE_AUTHORITATIVE;
+import static org.apache.hadoop.fs.s3a.Constants.S3_METADATA_STORE_IMPL;
+import static org.apache.hadoop.fs.s3a.S3ATestUtils.awaitFileStatus;
+import static 
org.apache.hadoop.fs.s3a.S3ATestUtils.metadataStorePersistsAuthoritativeBit;
+import static 
org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides;
+import static org.junit.Assume.assumeTrue;
+
+/**
+ * Integration tests for the S3Guard Fsck against a DynamoDB backed metadata
+ * store.
+ */
+public class ITestS3GuardFsck extends AbstractS3ATestBase {
+
+  private S3AFileSystem guardedFs;
+  private S3AFileSystem rawFS;
+
+  private MetadataStore metadataStore;
+
+  @Before
+  public void setup() throws Exception {
+super.setup();
+S3AFileSystem fs = getFileSystem();
+// These tests will fail if there is no metadata store
+assertTrue("FS needs to have a metadatastore.",
+fs.hasMetadataStore());
+assertTrue("Metadatastore should persist authoritative bit",
+metadataStorePersistsAuthoritativeBit(fs.getMetadataStore()));
+
+guardedFs = fs;
+metadataStore = fs.getMetadataStore();
+
+// create raw fs without s3guard
+rawFS = createUnguardedFS();
+assertFalse("Raw FS still has S3Guard " + rawFS,
+rawFS.hasMetadataStore());
+  }
+
+  @Override
+  public void teardown() throws Exception {
+if (guardedFs != null) {
+  IOUtils.cleanupWithLogger(LOG, guardedFs);
+}
+IOUtils.cleanupWithLogger(LOG, rawFS);
+super.teardown();
+  }
+
+  /**
+   * Create a test filesystem which is always unguarded.
+   * This filesystem MUST be closed in test teardown.
+   * @return the new FS
+   */
+  private S3AFileSystem createUnguardedFS() throws Exception {
+S3AFileSystem testFS = getFileSystem();
+Configuration config = new Configuration(testFS.getConf());
+URI uri = testFS.getUri();
+
+removeBaseAndBucketOverrides(uri.getHost(), config,
+S3_METADATA_STORE_IMPL);
+removeBaseAndBucketOverrides(uri.getHost(), config,
+METADATASTORE_AUTHORITATIVE);
+S3AFileSystem fs2 = new S3AFileSystem();
+fs2.initialize(uri, config);
+return fs2;
+  }
+
+  @Test
+  public void testIDetectNoMetadataEntry() throws Exception {
+final Path cwd = path("/" + getMethodName() + "-" + UUID.randomUUID());
+final Path file = new Path(cwd, "file");
+try {
+  touch(rawFS, file);
+  awaitFileStatus(rawFS, file);
+
+  final S3GuardFsck s3GuardFsck =
+  new S3GuardFsck(rawFS, metadataStore);
+
+  final List comparePairs =
+  s3GuardFsck.compareS3ToMs(cwd);
+
+  assertEquals("Number of pairs should be two.", 2,
+  comparePairs.size());
+  final S3GuardFsck.ComparePair pair = comparePairs.get(0);
+  assertTrue("The pair must contain a violation.", 
pair.containsViolation());
+  assertEquals("The pair must contain only one violation", 1,
+  pair.getViolations().size());
+
+  final S3GuardFsck.Violation violation =
+  pair.getViolations().iterator().next();
+  assertEquals("The violation should be that there is no violation entry.",
+  violation, S3GuardFsck.Vi

[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r321806500
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsck.java
 ##
 @@ -0,0 +1,421 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.s3a.AWSBadRequestException;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+
+import com.google.common.base.Stopwatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.security.InvalidParameterException;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Queue;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+import static java.util.stream.Collectors.toList;
+import static java.util.stream.Collectors.toSet;
+
+/**
+ * Main class for the FSCK factored out from S3GuardTool
+ * The implementation uses fixed DynamoDBMetadataStore as the backing store
+ * for metadata.
+ *
+ * Functions:
+ * 
+ *   Checking metadata consistency between S3 and metadatastore
+ * 
+ */
+public class S3GuardFsck {
+  private static final Logger LOG = LoggerFactory.getLogger(S3GuardFsck.class);
+  public static final String ROOT_PATH_STRING = "/";
+
+  private S3AFileSystem rawFS;
+  private DynamoDBMetadataStore metadataStore;
+
+  /**
+   * Creates an S3GuardFsck.
+   * @param fs the filesystem to compare to
+   * @param ms metadatastore the metadatastore to compare with (dynamo)
+   */
+  S3GuardFsck(S3AFileSystem fs, MetadataStore ms)
+  throws InvalidParameterException {
+this.rawFS = fs;
+
+if (ms == null) {
+  throw new InvalidParameterException("S3AFileSystem should be guarded by"
+  + " a " + DynamoDBMetadataStore.class.getCanonicalName());
+}
+this.metadataStore = (DynamoDBMetadataStore) ms;
+
+if (rawFS.hasMetadataStore()) {
+  throw new InvalidParameterException("Raw fs should not have a "
+  + "metadatastore.");
+}
+  }
+
+  /**
+   * Compares S3 to MS.
+   * Iterative breadth first walk on the S3 structure from a given root.
+   * Creates a list of pairs (metadata in S3 and in the MetadataStore) where
+   * the consistency or any rule is violated.
+   * Uses {@link S3GuardFsckViolationHandler} to handle violations.
+   * The violations are listed in Enums: {@link Violation}
+   *
+   * @param p the root path to start the traversal
+   * @throws IOException
+   * @return a list of {@link ComparePair}
+   */
+  public List compareS3ToMs(Path p) throws IOException {
+Stopwatch stopwatch = Stopwatch.createStarted();
+int scannedItems = 0;
+
+final Path rootPath = rawFS.qualify(p);
+S3AFileStatus root = null;
+try {
+  root = (S3AFileStatus) rawFS.getFileStatus(rootPath);
+} catch (AWSBadRequestException e) {
+  throw new IOException(e.getMessage());
+}
+final List comparePairs = new ArrayList<>();
+final Queue queue = new ArrayDeque<>();
+queue.add(root);
+
+while (!queue.isEmpty()) {
+  final S3AFileStatus currentDir = queue.poll();
+  scannedItems++;
+
+  final Path currentDirPath = currentDir.getPath();
+  List s3DirListing = 
Arrays.asList(rawFS.listStatus(currentDirPath));
+
+  // DIRECTORIES
+  // Check directory authoritativeness consistency
+  compareAuthoritativeDirectoryFlag(comparePairs, currentDirPath, 
s3DirListing);
+  // Add all descendant directory to the queue
+  s3DirListing.stream().filter(pm -> pm.isDirectory())
+  .map(S3AFileStatus.class::cast)
+  .forEach(pm -> queue.add(pm));
+
+  // FILES
+  // check files for consistency
+  final List children = s3DirListing.stream()
+  .filter(status -> !status.isDirectory())
+  .map(S3AFileStatus.class::cast).collect(toList());
+   

[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r316670881
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardTool.java
 ##
 @@ -1485,6 +1486,89 @@ private void vprintln(PrintStream out, String format, 
Object...
 }
   }
 
+  /**
+   * Fsck: compare S3 with the MetadataStore and report any violations.
+   */
+  static class Fsck extends S3GuardTool {
+public static final String CHECK_FLAG = "check";
+
+public static final String NAME = "fsck";
+public static final String PURPOSE = "Compares S3 with MetadataStore, and "
++ "returns a failure status if any rules or invariants are violated. "
++ "Only works with DynamoDbMetadataStore.";
 
 Review comment:
   how about "only works with DynamoDB metadata stores"


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-issues-h...@hadoop.apache.org



[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r321801869
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsck.java
 ##
 @@ -0,0 +1,421 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.s3a.AWSBadRequestException;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+
+import com.google.common.base.Stopwatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.security.InvalidParameterException;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Queue;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+import static java.util.stream.Collectors.toList;
+import static java.util.stream.Collectors.toSet;
+
+/**
+ * Main class for the FSCK factored out from S3GuardTool
+ * The implementation uses fixed DynamoDBMetadataStore as the backing store
+ * for metadata.
+ *
+ * Functions:
+ * 
+ *   Checking metadata consistency between S3 and metadatastore
+ * 
+ */
+public class S3GuardFsck {
+  private static final Logger LOG = LoggerFactory.getLogger(S3GuardFsck.class);
+  public static final String ROOT_PATH_STRING = "/";
+
+  private S3AFileSystem rawFS;
+  private DynamoDBMetadataStore metadataStore;
+
+  /**
+   * Creates an S3GuardFsck.
+   * @param fs the filesystem to compare to
+   * @param ms metadatastore the metadatastore to compare with (dynamo)
+   */
+  S3GuardFsck(S3AFileSystem fs, MetadataStore ms)
+  throws InvalidParameterException {
+this.rawFS = fs;
+
+if (ms == null) {
+  throw new InvalidParameterException("S3AFileSystem should be guarded by"
+  + " a " + DynamoDBMetadataStore.class.getCanonicalName());
+}
+this.metadataStore = (DynamoDBMetadataStore) ms;
+
+if (rawFS.hasMetadataStore()) {
+  throw new InvalidParameterException("Raw fs should not have a "
+  + "metadatastore.");
+}
+  }
+
+  /**
+   * Compares S3 to MS.
+   * Iterative breadth first walk on the S3 structure from a given root.
+   * Creates a list of pairs (metadata in S3 and in the MetadataStore) where
+   * the consistency or any rule is violated.
+   * Uses {@link S3GuardFsckViolationHandler} to handle violations.
+   * The violations are listed in Enums: {@link Violation}
+   *
+   * @param p the root path to start the traversal
+   * @throws IOException
+   * @return a list of {@link ComparePair}
+   */
+  public List compareS3ToMs(Path p) throws IOException {
+Stopwatch stopwatch = Stopwatch.createStarted();
+int scannedItems = 0;
+
+final Path rootPath = rawFS.qualify(p);
+S3AFileStatus root = null;
+try {
+  root = (S3AFileStatus) rawFS.getFileStatus(rootPath);
+} catch (AWSBadRequestException e) {
+  throw new IOException(e.getMessage());
+}
+final List comparePairs = new ArrayList<>();
+final Queue queue = new ArrayDeque<>();
+queue.add(root);
+
+while (!queue.isEmpty()) {
+  final S3AFileStatus currentDir = queue.poll();
+  scannedItems++;
+
+  final Path currentDirPath = currentDir.getPath();
+  List s3DirListing = 
Arrays.asList(rawFS.listStatus(currentDirPath));
+
+  // DIRECTORIES
+  // Check directory authoritativeness consistency
+  compareAuthoritativeDirectoryFlag(comparePairs, currentDirPath, 
s3DirListing);
+  // Add all descendant directory to the queue
+  s3DirListing.stream().filter(pm -> pm.isDirectory())
+  .map(S3AFileStatus.class::cast)
+  .forEach(pm -> queue.add(pm));
+
+  // FILES
+  // check files for consistency
+  final List children = s3DirListing.stream()
+  .filter(status -> !status.isDirectory())
+  .map(S3AFileStatus.class::cast).collect(toList());
+   

[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r316660718
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsck.java
 ##
 @@ -0,0 +1,395 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.security.InvalidParameterException;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Queue;
+import java.util.Set;
+
+import static java.util.stream.Collectors.toList;
+import static java.util.stream.Collectors.toSet;
+
+/**
+ * Main class for the FSCK factored out from S3GuardTool
+ * The implementation uses fixed DynamoDBMetadataStore as the backing store
+ * for metadata.
+ *
+ * Functions:
+ * 
+ *   Checking metadata consistency between S3 and metadatastore
+ * 
+ */
+public class S3GuardFsck {
+  private static final Logger LOG = LoggerFactory.getLogger(S3GuardFsck.class);
+  public static final String ROOT_PATH_STRING = "/";
+
+  private S3AFileSystem rawFS;
+  private DynamoDBMetadataStore metadataStore;
+
+  /**
+   * Creates an S3GuardFsck.
+   * @param fs the filesystem to compare to
+   * @param ms metadatastore the metadatastore to compare with (dynamo)
+   */
+  S3GuardFsck(S3AFileSystem fs, MetadataStore ms)
+  throws InvalidParameterException {
+this.rawFS = fs;
+
+if (ms == null) {
+  throw new InvalidParameterException("S3AFileSystem should be guarded by"
+  + " a " + DynamoDBMetadataStore.class.getCanonicalName());
+}
+this.metadataStore = (DynamoDBMetadataStore) ms;
+
+if (rawFS.hasMetadataStore()) {
+  throw new InvalidParameterException("Raw fs should not have a "
+  + "metadatastore.");
+}
+  }
+
+  /**
+   * Compares S3 to MS.
+   * Iterative breadth first walk on the S3 structure from a given root.
+   * Creates a list of pairs (metadata in S3 and in the MetadataStore) where
+   * the consistency or any rule is violated.
+   * Uses {@link S3GuardFsckViolationHandler} to handle violations.
+   * The violations are listed in Enums: {@link Violation}
+   *
+   * @param p the root path to start the traversal
+   * @throws IOException
+   * @return
+   */
+  public List compareS3RootToMs(Path p) throws IOException {
+final Path rootPath = rawFS.qualify(p);
+final S3AFileStatus root =
+(S3AFileStatus) rawFS.getFileStatus(rootPath);
+final List comparePairs = new ArrayList<>();
+final Queue queue = new ArrayDeque<>();
+queue.add(root);
+
+while (!queue.isEmpty()) {
+  // pop front node from the queue
+  final S3AFileStatus currentDir = queue.poll();
+
+  // Get a listing of that dir from s3 and add just the files.
+  // (Each directory will be added as a root.)
+  // Files should be casted to S3AFileStatus instead of plain FileStatus
+  // to get the VersionID and Etag.
+  final Path currentDirPath = currentDir.getPath();
+
+  final FileStatus[] s3DirListing = rawFS.listStatus(currentDirPath);
+  final List children =
+  Arrays.asList(s3DirListing).stream()
+  .filter(status -> !status.isDirectory())
+  .map(S3AFileStatus.class::cast).collect(toList());
+
+  // Compare the directory contents if the listing is authoritative
+  final DirListingMetadata msDirListing =
+  metadataStore.listChildren(currentDirPath);
+  if (msDirListing != null && msDirListing.isAuthoritative()) {
+final ComparePair cP =
+compareAuthDirListing(s3DirListing, msDirListing);
+if (cP.containsViolation()) {
+  comparePairs.add(cP);
+}
+  }
+
+  // Compare directory and contents, but not the listing
+  final

[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r321801203
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsck.java
 ##
 @@ -0,0 +1,421 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.s3a.AWSBadRequestException;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+
+import com.google.common.base.Stopwatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.security.InvalidParameterException;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Queue;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+import static java.util.stream.Collectors.toList;
+import static java.util.stream.Collectors.toSet;
+
+/**
+ * Main class for the FSCK factored out from S3GuardTool
+ * The implementation uses fixed DynamoDBMetadataStore as the backing store
+ * for metadata.
+ *
+ * Functions:
+ * 
+ *   Checking metadata consistency between S3 and metadatastore
+ * 
+ */
+public class S3GuardFsck {
+  private static final Logger LOG = LoggerFactory.getLogger(S3GuardFsck.class);
+  public static final String ROOT_PATH_STRING = "/";
+
+  private S3AFileSystem rawFS;
+  private DynamoDBMetadataStore metadataStore;
+
+  /**
+   * Creates an S3GuardFsck.
+   * @param fs the filesystem to compare to
+   * @param ms metadatastore the metadatastore to compare with (dynamo)
+   */
+  S3GuardFsck(S3AFileSystem fs, MetadataStore ms)
+  throws InvalidParameterException {
+this.rawFS = fs;
+
+if (ms == null) {
+  throw new InvalidParameterException("S3AFileSystem should be guarded by"
+  + " a " + DynamoDBMetadataStore.class.getCanonicalName());
+}
+this.metadataStore = (DynamoDBMetadataStore) ms;
+
+if (rawFS.hasMetadataStore()) {
+  throw new InvalidParameterException("Raw fs should not have a "
+  + "metadatastore.");
+}
+  }
+
+  /**
+   * Compares S3 to MS.
+   * Iterative breadth first walk on the S3 structure from a given root.
+   * Creates a list of pairs (metadata in S3 and in the MetadataStore) where
+   * the consistency or any rule is violated.
+   * Uses {@link S3GuardFsckViolationHandler} to handle violations.
+   * The violations are listed in Enums: {@link Violation}
+   *
+   * @param p the root path to start the traversal
+   * @throws IOException
+   * @return a list of {@link ComparePair}
+   */
+  public List compareS3ToMs(Path p) throws IOException {
+Stopwatch stopwatch = Stopwatch.createStarted();
+int scannedItems = 0;
+
+final Path rootPath = rawFS.qualify(p);
+S3AFileStatus root = null;
+try {
+  root = (S3AFileStatus) rawFS.getFileStatus(rootPath);
+} catch (AWSBadRequestException e) {
+  throw new IOException(e.getMessage());
+}
+final List comparePairs = new ArrayList<>();
+final Queue queue = new ArrayDeque<>();
+queue.add(root);
+
+while (!queue.isEmpty()) {
+  final S3AFileStatus currentDir = queue.poll();
+  scannedItems++;
+
+  final Path currentDirPath = currentDir.getPath();
+  List s3DirListing = 
Arrays.asList(rawFS.listStatus(currentDirPath));
+
+  // DIRECTORIES
+  // Check directory authoritativeness consistency
+  compareAuthoritativeDirectoryFlag(comparePairs, currentDirPath, 
s3DirListing);
+  // Add all descendant directory to the queue
+  s3DirListing.stream().filter(pm -> pm.isDirectory())
+  .map(S3AFileStatus.class::cast)
+  .forEach(pm -> queue.add(pm));
+
+  // FILES
+  // check files for consistency
+  final List children = s3DirListing.stream()
+  .filter(status -> !status.isDirectory())
+  .map(S3AFileStatus.class::cast).collect(toList());
+   

[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r321808553
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardTool.java
 ##
 @@ -1485,6 +1486,93 @@ private void vprintln(PrintStream out, String format, 
Object...
 }
   }
 
+  /**
+   * Fsck: compare S3 with the MetadataStore and report any violations.
+   */
+  static class Fsck extends S3GuardTool {
+public static final String CHECK_FLAG = "check";
+
+public static final String NAME = "fsck";
+public static final String PURPOSE = "Compares S3 with MetadataStore, and "
++ "returns a failure status if any rules or invariants are violated. "
++ "Only works with DynamoDbMetadataStore.";
+private static final String USAGE = NAME + " [OPTIONS] [s3a://BUCKET]\n" +
+"\t" + PURPOSE + "\n\n" +
+"Common options:\n" +
+"  " + CHECK_FLAG + " Check the metadata store for errors, but do "
++ "not fix any issues.\n";
+
+Fsck(Configuration conf) {
+  super(conf, CHECK_FLAG);
+}
+
+@Override
+public String getName() {
+  return NAME;
+}
+
+@Override
+public String getUsage() {
+  return USAGE;
+}
+
+public int run(String[] args, PrintStream out) throws
+InterruptedException, IOException {
+  List paths = parseArgs(args);
+  if (paths.isEmpty()) {
+out.println(USAGE);
+throw invalidArgs("no arguments");
+  }
+
+  String s3Path = paths.get(0);
+  try {
+initS3AFileSystem(s3Path);
+  } catch (Exception e) {
+errorln("Failed to initialize S3AFileSystem from path: " + s3Path);
+throw e;
+  }
+
+  URI uri = toUri(s3Path);
+  Path root;
+  if (uri.getPath().isEmpty()) {
+root = new Path("/");
+  } else {
+root = new Path(uri.getPath());
+  }
+
+  final S3AFileSystem fs = getFilesystem();
+  initMetadataStore(false);
+  final MetadataStore ms = getStore();
+
+  if (ms == null ||
+  !(ms instanceof DynamoDBMetadataStore)) {
+errorln(s3Path + " path uses MS: " + ms);
+errorln(NAME + " can be only used with a DynamoDB backed s3a bucket.");
+errorln(USAGE);
+return ERROR;
+  }
+
+  final CommandFormat commandFormat = getCommandFormat();
+  if (commandFormat.getOpt(CHECK_FLAG)) {
+// do the check
+S3GuardFsck s3GuardFsck = new S3GuardFsck(fs, ms);
+try {
+  s3GuardFsck.compareS3ToMs(fs.qualify(root));
+} catch (IOException e) {
+  errorln("Error while running the check: compareS3ToMs");
 
 Review comment:
   Is this needed; the runner logs anyway?


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-issues-h...@hadoop.apache.org



[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r321803937
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsckViolationHandler.java
 ##
 @@ -0,0 +1,315 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.lang.reflect.InvocationTargetException;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Violation handler for the S3Guard's fsck.
+ */
+public class S3GuardFsckViolationHandler {
+  private static final Logger LOG = LoggerFactory.getLogger(
+  S3GuardFsckViolationHandler.class);
+
+  // The rawFS and metadataStore are here to prepare when the ViolationHandlers
+  // will not just log, but fix the violations, so they will have access.
+  private S3AFileSystem rawFs;
+  private DynamoDBMetadataStore metadataStore;
+
+  private static String newLine = System.getProperty("line.separator");
+
+  public S3GuardFsckViolationHandler(S3AFileSystem fs,
+  DynamoDBMetadataStore ddbms) {
+
+this.metadataStore = ddbms;
+this.rawFs = fs;
+  }
+
+  public void handle(S3GuardFsck.ComparePair comparePair) {
+if (!comparePair.containsViolation()) {
+  LOG.debug("There is no violation in the compare pair: " + toString());
+  return;
+}
+
+StringBuilder sB = new StringBuilder();
+sB.append(newLine)
+.append("On path: ").append(comparePair.getPath()).append(newLine);
+
+// Create a new instance of the handler and use it.
+for (S3GuardFsck.Violation violation : comparePair.getViolations()) {
+  try {
+ViolationHandler handler = violation.getHandler()
+.getDeclaredConstructor(S3GuardFsck.ComparePair.class)
+.newInstance(comparePair);
+final String errorStr = handler.getError();
+sB.append(errorStr);
+  } catch (NoSuchMethodException e) {
+LOG.error("Can not find declared constructor for handler: {}",
+violation.getHandler());
+  } catch (IllegalAccessException | InstantiationException | 
InvocationTargetException e) {
+LOG.error("Can not instantiate handler: {}",
+violation.getHandler());
+  }
+  sB.append(newLine);
+}
+LOG.error(sB.toString());
 
 Review comment:
   I think we should change the log level based on the severity. This matters 
for those of us who have their log4j settings set to log different levels in 
different colours, and it will help people interpret the output.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-issues-h...@hadoop.apache.org



[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r316665014
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsckViolationHandler.java
 ##
 @@ -0,0 +1,312 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.lang.reflect.InvocationTargetException;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Violation handler for the S3Guard's fsck.
+ */
+public class S3GuardFsckViolationHandler {
+  private static final Logger LOG = LoggerFactory.getLogger(
+  S3GuardFsckViolationHandler.class);
+
+  private S3AFileSystem rawFs;
+  private DynamoDBMetadataStore metadataStore;
+  private static String newLine = System.getProperty("line.separator");
+
+  public S3GuardFsckViolationHandler(S3AFileSystem fs,
+  DynamoDBMetadataStore ddbms) {
+
+this.metadataStore = ddbms;
+this.rawFs = fs;
+  }
+
+  public void handle(S3GuardFsck.ComparePair comparePair) {
+if (!comparePair.containsViolation()) {
+  LOG.debug("There is no violation in the compare pair: " + toString());
+  return;
+}
+
+StringBuilder sB = new StringBuilder();
+sB.append(newLine)
+.append("On path: ").append(comparePair.getPath()).append(newLine);
+
+// Create a new instance of the handler and use it.
+for (S3GuardFsck.Violation violation : comparePair.getViolations()) {
+  try {
+ViolationHandler handler = violation.getHandler()
+.getDeclaredConstructor(S3GuardFsck.ComparePair.class)
+.newInstance(comparePair);
+final String errorStr = handler.getError();
+sB.append(errorStr);
+  } catch (NoSuchMethodException e) {
+LOG.error("Can not find declared constructor for handler: {}",
+violation.getHandler());
+  } catch (IllegalAccessException | InstantiationException | 
InvocationTargetException e) {
+LOG.error("Can not instantiate handler: {}",
+violation.getHandler());
+  }
+  sB.append(newLine);
+}
+LOG.error(sB.toString());
+  }
+
+  /**
+   * Violation handler abstract class.
+   * This class should be extended for violation handlers.
+   */
+  public static abstract class ViolationHandler {
+private final PathMetadata pathMetadata;
+private final S3AFileStatus s3FileStatus;
+private final S3AFileStatus msFileStatus;
+private final List s3DirListing;
+private final DirListingMetadata msDirListing;
+
+public ViolationHandler(S3GuardFsck.ComparePair comparePair) {
+  pathMetadata = comparePair.getMsPathMetadata();
+  s3FileStatus = comparePair.getS3FileStatus();
+  if (pathMetadata != null) {
+msFileStatus = pathMetadata.getFileStatus();
+  } else {
+msFileStatus = null;
+  }
+  s3DirListing = comparePair.getS3DirListing();
+  msDirListing = comparePair.getMsDirListing();
+}
+
+abstract String getError();
+
+public PathMetadata getPathMetadata() {
+  return pathMetadata;
+}
+
+public S3AFileStatus getS3FileStatus() {
+  return s3FileStatus;
+}
+
+public S3AFileStatus getMsFileStatus() {
+  return msFileStatus;
+}
+
+public List getS3DirListing() {
+  return s3DirListing;
+}
+
+public DirListingMetadata getMsDirListing() {
+  return msDirListing;
+}
+  }
+
+  /**
+   * The violation handler when there's no matching metadata entry in the MS.
+   */
+  public static class NoMetadataEntry extends ViolationHandler {
+
+public NoMetadataEntry(S3GuardFsck.ComparePair comparePair) {
+  super(comparePair);
+}
+
+@Override
+public String getError() {
+  return "No PathMetadata for this path in the MS.";
+}
+  }
+
+  /**
+   * The violation handler when there's no parent entry.
+   */
+  public s

[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r321798717
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsck.java
 ##
 @@ -0,0 +1,421 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.s3a.AWSBadRequestException;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+
+import com.google.common.base.Stopwatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.security.InvalidParameterException;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Queue;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+import static java.util.stream.Collectors.toList;
+import static java.util.stream.Collectors.toSet;
+
+/**
+ * Main class for the FSCK factored out from S3GuardTool
+ * The implementation uses fixed DynamoDBMetadataStore as the backing store
+ * for metadata.
+ *
+ * Functions:
+ * 
+ *   Checking metadata consistency between S3 and metadatastore
+ * 
+ */
+public class S3GuardFsck {
+  private static final Logger LOG = LoggerFactory.getLogger(S3GuardFsck.class);
+  public static final String ROOT_PATH_STRING = "/";
+
+  private S3AFileSystem rawFS;
+  private DynamoDBMetadataStore metadataStore;
+
+  /**
+   * Creates an S3GuardFsck.
+   * @param fs the filesystem to compare to
+   * @param ms metadatastore the metadatastore to compare with (dynamo)
+   */
+  S3GuardFsck(S3AFileSystem fs, MetadataStore ms)
+  throws InvalidParameterException {
+this.rawFS = fs;
+
+if (ms == null) {
+  throw new InvalidParameterException("S3AFileSystem should be guarded by"
+  + " a " + DynamoDBMetadataStore.class.getCanonicalName());
+}
+this.metadataStore = (DynamoDBMetadataStore) ms;
+
+if (rawFS.hasMetadataStore()) {
+  throw new InvalidParameterException("Raw fs should not have a "
+  + "metadatastore.");
+}
+  }
+
+  /**
+   * Compares S3 to MS.
+   * Iterative breadth first walk on the S3 structure from a given root.
+   * Creates a list of pairs (metadata in S3 and in the MetadataStore) where
+   * the consistency or any rule is violated.
+   * Uses {@link S3GuardFsckViolationHandler} to handle violations.
+   * The violations are listed in Enums: {@link Violation}
+   *
+   * @param p the root path to start the traversal
+   * @throws IOException
+   * @return a list of {@link ComparePair}
+   */
+  public List compareS3ToMs(Path p) throws IOException {
+Stopwatch stopwatch = Stopwatch.createStarted();
+int scannedItems = 0;
+
+final Path rootPath = rawFS.qualify(p);
+S3AFileStatus root = null;
+try {
+  root = (S3AFileStatus) rawFS.getFileStatus(rootPath);
+} catch (AWSBadRequestException e) {
+  throw new IOException(e.getMessage());
 
 Review comment:
   This already is an IOE; no need to convert it to one (while losing the 
stack).


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-issues-h...@hadoop.apache.org



[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r321810743
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardToolDynamoDB.java
 ##
 @@ -289,4 +291,30 @@ public void testDestroyUnknownTable() throws Throwable {
 "-meta", "dynamodb://" + getTestTableName(DYNAMODB_TABLE));
   }
 
+  @Test
+  public void testCLIFsckWithoutParam() throws Exception {
+intercept(ExitUtil.ExitException.class, () -> run(Fsck.NAME));
+  }
+
+  @Test
+  public void testCLIFsckWithParam() throws Exception {
+final int result = run(S3GuardTool.Fsck.NAME, "-check",
+"s3a://" + getFileSystem().getBucket());
 
 Review comment:
   This test failed for me during a parallel run. This parallelizable test 
should use a path which we know is there but is private to this test; we can 
have another one which invokes fsck on a missing path. The full root scan 
should be run in the ITestS3GuardDDBRootOperations test, before any cleanup.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-issues-h...@hadoop.apache.org



[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r321804202
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsckViolationHandler.java
 ##
 @@ -0,0 +1,315 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.lang.reflect.InvocationTargetException;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Violation handler for the S3Guard's fsck.
+ */
+public class S3GuardFsckViolationHandler {
+  private static final Logger LOG = LoggerFactory.getLogger(
+  S3GuardFsckViolationHandler.class);
+
+  // The rawFS and metadataStore are here to prepare when the ViolationHandlers
+  // will not just log, but fix the violations, so they will have access.
+  private S3AFileSystem rawFs;
+  private DynamoDBMetadataStore metadataStore;
+
+  private static String newLine = System.getProperty("line.separator");
+
+  public S3GuardFsckViolationHandler(S3AFileSystem fs,
+  DynamoDBMetadataStore ddbms) {
+
+this.metadataStore = ddbms;
+this.rawFs = fs;
+  }
+
+  public void handle(S3GuardFsck.ComparePair comparePair) {
+if (!comparePair.containsViolation()) {
+  LOG.debug("There is no violation in the compare pair: " + toString());
+  return;
+}
+
+StringBuilder sB = new StringBuilder();
+sB.append(newLine)
+.append("On path: ").append(comparePair.getPath()).append(newLine);
+
+// Create a new instance of the handler and use it.
+for (S3GuardFsck.Violation violation : comparePair.getViolations()) {
+  try {
+ViolationHandler handler = violation.getHandler()
+.getDeclaredConstructor(S3GuardFsck.ComparePair.class)
+.newInstance(comparePair);
+final String errorStr = handler.getError();
+sB.append(errorStr);
+  } catch (NoSuchMethodException e) {
+LOG.error("Can not find declared constructor for handler: {}",
+violation.getHandler());
+  } catch (IllegalAccessException | InstantiationException | 
InvocationTargetException e) {
+LOG.error("Can not instantiate handler: {}",
+violation.getHandler());
+  }
+  sB.append(newLine);
+}
+LOG.error(sB.toString());
+  }
+
+  /**
+   * Violation handler abstract class.
+   * This class should be extended for violation handlers.
+   */
+  public static abstract class ViolationHandler {
+private final PathMetadata pathMetadata;
+private final S3AFileStatus s3FileStatus;
+private final S3AFileStatus msFileStatus;
+private final List s3DirListing;
+private final DirListingMetadata msDirListing;
+
+public ViolationHandler(S3GuardFsck.ComparePair comparePair) {
+  pathMetadata = comparePair.getMsPathMetadata();
+  s3FileStatus = comparePair.getS3FileStatus();
+  if (pathMetadata != null) {
+msFileStatus = pathMetadata.getFileStatus();
+  } else {
+msFileStatus = null;
+  }
+  s3DirListing = comparePair.getS3DirListing();
+  msDirListing = comparePair.getMsDirListing();
+}
+
+abstract String getError();
 
 Review comment:
   getError() should be public too, if the rest of the methods are.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-issues-h...@hadoop.apache.org



[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r316664767
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsckViolationHandler.java
 ##
 @@ -0,0 +1,312 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.lang.reflect.InvocationTargetException;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Violation handler for the S3Guard's fsck.
+ */
+public class S3GuardFsckViolationHandler {
+  private static final Logger LOG = LoggerFactory.getLogger(
+  S3GuardFsckViolationHandler.class);
+
+  private S3AFileSystem rawFs;
+  private DynamoDBMetadataStore metadataStore;
+  private static String newLine = System.getProperty("line.separator");
+
+  public S3GuardFsckViolationHandler(S3AFileSystem fs,
+  DynamoDBMetadataStore ddbms) {
+
+this.metadataStore = ddbms;
+this.rawFs = fs;
+  }
+
+  public void handle(S3GuardFsck.ComparePair comparePair) {
+if (!comparePair.containsViolation()) {
+  LOG.debug("There is no violation in the compare pair: " + toString());
+  return;
+}
+
+StringBuilder sB = new StringBuilder();
+sB.append(newLine)
+.append("On path: ").append(comparePair.getPath()).append(newLine);
+
+// Create a new instance of the handler and use it.
+for (S3GuardFsck.Violation violation : comparePair.getViolations()) {
+  try {
+ViolationHandler handler = violation.getHandler()
+.getDeclaredConstructor(S3GuardFsck.ComparePair.class)
+.newInstance(comparePair);
+final String errorStr = handler.getError();
+sB.append(errorStr);
+  } catch (NoSuchMethodException e) {
+LOG.error("Can not find declared constructor for handler: {}",
+violation.getHandler());
+  } catch (IllegalAccessException | InstantiationException | 
InvocationTargetException e) {
+LOG.error("Can not instantiate handler: {}",
+violation.getHandler());
+  }
+  sB.append(newLine);
+}
+LOG.error(sB.toString());
+  }
+
+  /**
+   * Violation handler abstract class.
+   * This class should be extended for violation handlers.
+   */
+  public static abstract class ViolationHandler {
+private final PathMetadata pathMetadata;
+private final S3AFileStatus s3FileStatus;
+private final S3AFileStatus msFileStatus;
+private final List s3DirListing;
+private final DirListingMetadata msDirListing;
+
+public ViolationHandler(S3GuardFsck.ComparePair comparePair) {
+  pathMetadata = comparePair.getMsPathMetadata();
+  s3FileStatus = comparePair.getS3FileStatus();
+  if (pathMetadata != null) {
+msFileStatus = pathMetadata.getFileStatus();
+  } else {
+msFileStatus = null;
+  }
+  s3DirListing = comparePair.getS3DirListing();
+  msDirListing = comparePair.getMsDirListing();
+}
+
+abstract String getError();
+
+public PathMetadata getPathMetadata() {
+  return pathMetadata;
+}
+
+public S3AFileStatus getS3FileStatus() {
+  return s3FileStatus;
+}
+
+public S3AFileStatus getMsFileStatus() {
+  return msFileStatus;
+}
+
+public List getS3DirListing() {
+  return s3DirListing;
+}
+
+public DirListingMetadata getMsDirListing() {
+  return msDirListing;
+}
+  }
+
+  /**
+   * The violation handler when there's no matching metadata entry in the MS.
+   */
+  public static class NoMetadataEntry extends ViolationHandler {
+
+public NoMetadataEntry(S3GuardFsck.ComparePair comparePair) {
+  super(comparePair);
+}
+
+@Override
+public String getError() {
+  return "No PathMetadata for this path in the MS.";
+}
+  }
+
+  /**
+   * The violation handler when there's no parent entry.
+   */
+  public s

[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r316667252
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardFsck.java
 ##
 @@ -0,0 +1,707 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+
+import java.net.URI;
+import java.util.List;
+import java.util.UUID;
+
+import org.apache.hadoop.io.IOUtils;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.s3a.AbstractS3ATestBase;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import org.assertj.core.api.Assertions;
+
+import static org.apache.hadoop.fs.contract.ContractTestUtils.touch;
+import static org.apache.hadoop.fs.s3a.Constants.METADATASTORE_AUTHORITATIVE;
+import static org.apache.hadoop.fs.s3a.Constants.S3_METADATA_STORE_IMPL;
+import static org.apache.hadoop.fs.s3a.S3ATestUtils.awaitFileStatus;
+import static 
org.apache.hadoop.fs.s3a.S3ATestUtils.metadataStorePersistsAuthoritativeBit;
+import static 
org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides;
+import static org.junit.Assume.assumeTrue;
+
+/**
+ * Integration tests for the S3Guard Fsck against a dyamodb backed metadata
+ * store.
+ */
+public class ITestS3GuardFsck extends AbstractS3ATestBase {
+
+  private S3AFileSystem guardedFs;
+  private S3AFileSystem rawFS;
+
+  private MetadataStore metadataStore;
+
+  @Before
+  public void setup() throws Exception {
+super.setup();
+S3AFileSystem fs = getFileSystem();
+// These test will fail if no ms
+assertTrue("FS needs to have a metadatastore.",
+fs.hasMetadataStore());
+assertTrue("Metadatastore should persist authoritative bit",
+metadataStorePersistsAuthoritativeBit(fs.getMetadataStore()));
+
+guardedFs = fs;
+metadataStore = fs.getMetadataStore();
+
+// create raw fs without s3guard
+rawFS = createUnguardedFS();
+assertFalse("Raw FS still has S3Guard " + rawFS,
+rawFS.hasMetadataStore());
+  }
+
+  @Override
+  public void teardown() throws Exception {
+if (guardedFs != null) {
+  IOUtils.cleanupWithLogger(LOG, guardedFs);
+}
+IOUtils.cleanupWithLogger(LOG, rawFS);
+super.teardown();
+  }
+
+  /**
+   * Create a test filesystem which is always unguarded.
+   * This filesystem MUST be closed in test teardown.
+   * @return the new FS
+   */
+  private S3AFileSystem createUnguardedFS() throws Exception {
+S3AFileSystem testFS = getFileSystem();
+Configuration config = new Configuration(testFS.getConf());
+URI uri = testFS.getUri();
+
+removeBaseAndBucketOverrides(uri.getHost(), config,
+S3_METADATA_STORE_IMPL);
+removeBaseAndBucketOverrides(uri.getHost(), config,
+METADATASTORE_AUTHORITATIVE);
+S3AFileSystem fs2 = new S3AFileSystem();
+fs2.initialize(uri, config);
+return fs2;
+  }
+
+  @Test
+  public void testIDetectNoMetadataEntry() throws Exception {
+final Path cwd = path("/" + getMethodName() + "-" + UUID.randomUUID());
+final Path file = new Path(cwd, "file");
+try {
+  touch(rawFS, file);
+  awaitFileStatus(rawFS, file);
+
+  final S3GuardFsck s3GuardFsck =
+  new S3GuardFsck(rawFS, metadataStore);
+
+  final List comparePairs =
+  s3GuardFsck.compareS3RootToMs(cwd);
+
+  assertEquals("Number of pairs should be two.", 2,
+  comparePairs.size());
+  final S3GuardFsck.ComparePair pair = comparePairs.get(0);
+  assertTrue("The pair must contain a violation.", 
pair.containsViolation());
+  assertEquals("The pair must contain only one violation", 1,
+  pair.getViolations().size());
+
+  final S3GuardFsck.Violation violation =
+  pair.getViolations().iterator().next();
+  assertEquals("The violation should be that there is no violation entry.",
+  violation, S3GuardFsc

[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r321804874
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsckViolationHandler.java
 ##
 @@ -0,0 +1,315 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.lang.reflect.InvocationTargetException;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Violation handler for the S3Guard's fsck.
+ */
+public class S3GuardFsckViolationHandler {
+  private static final Logger LOG = LoggerFactory.getLogger(
+  S3GuardFsckViolationHandler.class);
+
+  // The rawFS and metadataStore are here to prepare when the ViolationHandlers
+  // will not just log, but fix the violations, so they will have access.
+  private S3AFileSystem rawFs;
+  private DynamoDBMetadataStore metadataStore;
+
+  private static String newLine = System.getProperty("line.separator");
+
+  public S3GuardFsckViolationHandler(S3AFileSystem fs,
+  DynamoDBMetadataStore ddbms) {
+
+this.metadataStore = ddbms;
+this.rawFs = fs;
+  }
+
+  public void handle(S3GuardFsck.ComparePair comparePair) {
+if (!comparePair.containsViolation()) {
+  LOG.debug("There is no violation in the compare pair: " + toString());
+  return;
+}
+
+StringBuilder sB = new StringBuilder();
+sB.append(newLine)
+.append("On path: ").append(comparePair.getPath()).append(newLine);
+
+// Create a new instance of the handler and use it.
+for (S3GuardFsck.Violation violation : comparePair.getViolations()) {
+  try {
+ViolationHandler handler = violation.getHandler()
+.getDeclaredConstructor(S3GuardFsck.ComparePair.class)
+.newInstance(comparePair);
+final String errorStr = handler.getError();
+sB.append(errorStr);
+  } catch (NoSuchMethodException e) {
+LOG.error("Can not find declared constructor for handler: {}",
+violation.getHandler());
+  } catch (IllegalAccessException | InstantiationException | 
InvocationTargetException e) {
+LOG.error("Can not instantiate handler: {}",
+violation.getHandler());
+  }
+  sB.append(newLine);
+}
+LOG.error(sB.toString());
+  }
+
+  /**
+   * Violation handler abstract class.
+   * This class should be extended for violation handlers.
+   */
+  public static abstract class ViolationHandler {
+private final PathMetadata pathMetadata;
+private final S3AFileStatus s3FileStatus;
+private final S3AFileStatus msFileStatus;
+private final List s3DirListing;
+private final DirListingMetadata msDirListing;
+
+public ViolationHandler(S3GuardFsck.ComparePair comparePair) {
+  pathMetadata = comparePair.getMsPathMetadata();
+  s3FileStatus = comparePair.getS3FileStatus();
+  if (pathMetadata != null) {
+msFileStatus = pathMetadata.getFileStatus();
+  } else {
+msFileStatus = null;
+  }
+  s3DirListing = comparePair.getS3DirListing();
+  msDirListing = comparePair.getMsDirListing();
+}
+
+abstract String getError();
+
+public PathMetadata getPathMetadata() {
+  return pathMetadata;
+}
+
+public S3AFileStatus getS3FileStatus() {
+  return s3FileStatus;
+}
+
+public S3AFileStatus getMsFileStatus() {
+  return msFileStatus;
+}
+
+public List getS3DirListing() {
+  return s3DirListing;
+}
+
+public DirListingMetadata getMsDirListing() {
+  return msDirListing;
+}
+  }
+
+  /**
+   * The violation handler when there's no matching metadata entry in the MS.
+   */
+  public static class NoMetadataEntry extends ViolationHandler {
+
+public NoMetadataEntry(S3GuardFsck.ComparePair comparePair) {
+  super(comparePair);
+}
+
+@Override
+public String getError()

[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r321799619
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsck.java
 ##
 @@ -0,0 +1,421 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.s3a.AWSBadRequestException;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+
+import com.google.common.base.Stopwatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.security.InvalidParameterException;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Queue;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+import static java.util.stream.Collectors.toList;
+import static java.util.stream.Collectors.toSet;
+
+/**
+ * Main class for the FSCK factored out from S3GuardTool
+ * The implementation uses fixed DynamoDBMetadataStore as the backing store
+ * for metadata.
+ *
+ * Functions:
+ * 
+ *   Checking metadata consistency between S3 and metadatastore
+ * 
+ */
+public class S3GuardFsck {
+  private static final Logger LOG = LoggerFactory.getLogger(S3GuardFsck.class);
+  public static final String ROOT_PATH_STRING = "/";
+
+  private S3AFileSystem rawFS;
+  private DynamoDBMetadataStore metadataStore;
+
+  /**
+   * Creates an S3GuardFsck.
+   * @param fs the filesystem to compare to
+   * @param ms metadatastore the metadatastore to compare with (dynamo)
+   */
+  S3GuardFsck(S3AFileSystem fs, MetadataStore ms)
+  throws InvalidParameterException {
+this.rawFS = fs;
+
+if (ms == null) {
+  throw new InvalidParameterException("S3AFileSystem should be guarded by"
+  + " a " + DynamoDBMetadataStore.class.getCanonicalName());
+}
+this.metadataStore = (DynamoDBMetadataStore) ms;
+
+if (rawFS.hasMetadataStore()) {
+  throw new InvalidParameterException("Raw fs should not have a "
+  + "metadatastore.");
+}
+  }
+
+  /**
+   * Compares S3 to MS.
+   * Iterative breadth first walk on the S3 structure from a given root.
+   * Creates a list of pairs (metadata in S3 and in the MetadataStore) where
+   * the consistency or any rule is violated.
+   * Uses {@link S3GuardFsckViolationHandler} to handle violations.
+   * The violations are listed in Enums: {@link Violation}
+   *
+   * @param p the root path to start the traversal
+   * @throws IOException
+   * @return a list of {@link ComparePair}
 
 Review comment:
   Flip the order of the `@return` and `@throws` tags: `@return` should come before `@throws`.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-issues-h...@hadoop.apache.org



[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r316668703
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardFsck.java
 ##
 @@ -0,0 +1,707 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+
+import java.net.URI;
+import java.util.List;
+import java.util.UUID;
+
+import org.apache.hadoop.io.IOUtils;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.s3a.AbstractS3ATestBase;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import org.assertj.core.api.Assertions;
+
+import static org.apache.hadoop.fs.contract.ContractTestUtils.touch;
+import static org.apache.hadoop.fs.s3a.Constants.METADATASTORE_AUTHORITATIVE;
+import static org.apache.hadoop.fs.s3a.Constants.S3_METADATA_STORE_IMPL;
+import static org.apache.hadoop.fs.s3a.S3ATestUtils.awaitFileStatus;
+import static 
org.apache.hadoop.fs.s3a.S3ATestUtils.metadataStorePersistsAuthoritativeBit;
+import static 
org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides;
+import static org.junit.Assume.assumeTrue;
+
+/**
+ * Integration tests for the S3Guard Fsck against a DynamoDB backed metadata
+ * store.
+ */
+public class ITestS3GuardFsck extends AbstractS3ATestBase {
 
 Review comment:
There's a lot of commonality in all these test cases; if possible, we should 
factor that out so that most of the boilerplate code is reused.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-issues-h...@hadoop.apache.org



[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r321799219
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsck.java
 ##
 @@ -0,0 +1,421 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.s3a.AWSBadRequestException;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+
+import com.google.common.base.Stopwatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.security.InvalidParameterException;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Queue;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+import static java.util.stream.Collectors.toList;
+import static java.util.stream.Collectors.toSet;
+
+/**
+ * Main class for the FSCK factored out from S3GuardTool
+ * The implementation uses fixed DynamoDBMetadataStore as the backing store
+ * for metadata.
+ *
+ * Functions:
+ * 
+ *   Checking metadata consistency between S3 and metadatastore
+ * 
+ */
+public class S3GuardFsck {
+  private static final Logger LOG = LoggerFactory.getLogger(S3GuardFsck.class);
+  public static final String ROOT_PATH_STRING = "/";
+
+  private S3AFileSystem rawFS;
+  private DynamoDBMetadataStore metadataStore;
+
+  /**
+   * Creates an S3GuardFsck.
+   * @param fs the filesystem to compare to
+   * @param ms metadatastore the metadatastore to compare with (dynamo)
+   */
+  S3GuardFsck(S3AFileSystem fs, MetadataStore ms)
+  throws InvalidParameterException {
+this.rawFS = fs;
+
+if (ms == null) {
+  throw new InvalidParameterException("S3AFileSystem should be guarded by"
+  + " a " + DynamoDBMetadataStore.class.getCanonicalName());
+}
+this.metadataStore = (DynamoDBMetadataStore) ms;
+
+if (rawFS.hasMetadataStore()) {
+  throw new InvalidParameterException("Raw fs should not have a "
+  + "metadatastore.");
+}
+  }
+
+  /**
+   * Compares S3 to MS.
+   * Iterative breadth first walk on the S3 structure from a given root.
+   * Creates a list of pairs (metadata in S3 and in the MetadataStore) where
+   * the consistency or any rule is violated.
+   * Uses {@link S3GuardFsckViolationHandler} to handle violations.
+   * The violations are listed in Enums: {@link Violation}
+   *
+   * @param p the root path to start the traversal
+   * @throws IOException
+   * @return a list of {@link ComparePair}
+   */
+  public List compareS3ToMs(Path p) throws IOException {
+Stopwatch stopwatch = Stopwatch.createStarted();
+int scannedItems = 0;
+
+final Path rootPath = rawFS.qualify(p);
+S3AFileStatus root = null;
+try {
+  root = (S3AFileStatus) rawFS.getFileStatus(rootPath);
+} catch (AWSBadRequestException e) {
+  throw new IOException(e.getMessage());
+}
+final List comparePairs = new ArrayList<>();
+final Queue queue = new ArrayDeque<>();
+queue.add(root);
+
+while (!queue.isEmpty()) {
+  final S3AFileStatus currentDir = queue.poll();
+  scannedItems++;
+
+  final Path currentDirPath = currentDir.getPath();
+  List s3DirListing = 
Arrays.asList(rawFS.listStatus(currentDirPath));
+
+  // DIRECTORIES
+  // Check directory authoritativeness consistency
+  compareAuthoritativeDirectoryFlag(comparePairs, currentDirPath, 
s3DirListing);
+  // Add all descendant directory to the queue
+  s3DirListing.stream().filter(pm -> pm.isDirectory())
+  .map(S3AFileStatus.class::cast)
+  .forEach(pm -> queue.add(pm));
+
+  // FILES
+  // check files for consistency
+  final List children = s3DirListing.stream()
+  .filter(status -> !status.isDirectory())
+  .map(S3AFileStatus.class::cast).collect(toList());
+   

[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r316663744
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsckViolationHandler.java
 ##
 @@ -0,0 +1,312 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
 
 Review comment:
   Imports


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-issues-h...@hadoop.apache.org



[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r316664318
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsckViolationHandler.java
 ##
 @@ -0,0 +1,312 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.lang.reflect.InvocationTargetException;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Violation handler for the S3Guard's fsck.
+ */
+public class S3GuardFsckViolationHandler {
+  private static final Logger LOG = LoggerFactory.getLogger(
+  S3GuardFsckViolationHandler.class);
+
+  private S3AFileSystem rawFs;
+  private DynamoDBMetadataStore metadataStore;
+  private static String newLine = System.getProperty("line.separator");
+
+  public S3GuardFsckViolationHandler(S3AFileSystem fs,
+  DynamoDBMetadataStore ddbms) {
+
+this.metadataStore = ddbms;
+this.rawFs = fs;
+  }
+
+  public void handle(S3GuardFsck.ComparePair comparePair) {
+if (!comparePair.containsViolation()) {
+  LOG.debug("There is no violation in the compare pair: " + toString());
+  return;
+}
+
+StringBuilder sB = new StringBuilder();
+sB.append(newLine)
+.append("On path: ").append(comparePair.getPath()).append(newLine);
+
+// Create a new instance of the handler and use it.
+for (S3GuardFsck.Violation violation : comparePair.getViolations()) {
+  try {
+ViolationHandler handler = violation.getHandler()
 
 Review comment:
If you pulled this out into a static method, it could be tested on its own.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-issues-h...@hadoop.apache.org



[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r321806663
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsck.java
 ##
 @@ -0,0 +1,421 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.s3a.AWSBadRequestException;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+
+import com.google.common.base.Stopwatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.security.InvalidParameterException;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Queue;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+import static java.util.stream.Collectors.toList;
+import static java.util.stream.Collectors.toSet;
+
+/**
+ * Main class for the FSCK factored out from S3GuardTool
+ * The implementation uses fixed DynamoDBMetadataStore as the backing store
+ * for metadata.
+ *
+ * Functions:
+ * 
+ *   Checking metadata consistency between S3 and metadatastore
+ * 
+ */
+public class S3GuardFsck {
+  private static final Logger LOG = LoggerFactory.getLogger(S3GuardFsck.class);
+  public static final String ROOT_PATH_STRING = "/";
+
+  private S3AFileSystem rawFS;
+  private DynamoDBMetadataStore metadataStore;
+
+  /**
+   * Creates an S3GuardFsck.
+   * @param fs the filesystem to compare to
+   * @param ms metadatastore the metadatastore to compare with (dynamo)
+   */
+  S3GuardFsck(S3AFileSystem fs, MetadataStore ms)
+  throws InvalidParameterException {
+this.rawFS = fs;
+
+if (ms == null) {
+  throw new InvalidParameterException("S3AFileSystem should be guarded by"
+  + " a " + DynamoDBMetadataStore.class.getCanonicalName());
+}
+this.metadataStore = (DynamoDBMetadataStore) ms;
+
+if (rawFS.hasMetadataStore()) {
+  throw new InvalidParameterException("Raw fs should not have a "
+  + "metadatastore.");
+}
+  }
+
+  /**
+   * Compares S3 to MS.
+   * Iterative breadth first walk on the S3 structure from a given root.
+   * Creates a list of pairs (metadata in S3 and in the MetadataStore) where
+   * the consistency or any rule is violated.
+   * Uses {@link S3GuardFsckViolationHandler} to handle violations.
+   * The violations are listed in Enums: {@link Violation}
+   *
+   * @param p the root path to start the traversal
+   * @throws IOException
+   * @return a list of {@link ComparePair}
+   */
+  public List compareS3ToMs(Path p) throws IOException {
+Stopwatch stopwatch = Stopwatch.createStarted();
+int scannedItems = 0;
+
+final Path rootPath = rawFS.qualify(p);
+S3AFileStatus root = null;
+try {
+  root = (S3AFileStatus) rawFS.getFileStatus(rootPath);
+} catch (AWSBadRequestException e) {
+  throw new IOException(e.getMessage());
+}
+final List comparePairs = new ArrayList<>();
+final Queue queue = new ArrayDeque<>();
+queue.add(root);
+
+while (!queue.isEmpty()) {
+  final S3AFileStatus currentDir = queue.poll();
+  scannedItems++;
+
+  final Path currentDirPath = currentDir.getPath();
+  List s3DirListing = 
Arrays.asList(rawFS.listStatus(currentDirPath));
+
+  // DIRECTORIES
+  // Check directory authoritativeness consistency
+  compareAuthoritativeDirectoryFlag(comparePairs, currentDirPath, 
s3DirListing);
+  // Add all descendant directory to the queue
+  s3DirListing.stream().filter(pm -> pm.isDirectory())
+  .map(S3AFileStatus.class::cast)
+  .forEach(pm -> queue.add(pm));
+
+  // FILES
+  // check files for consistency
+  final List children = s3DirListing.stream()
+  .filter(status -> !status.isDirectory())
+  .map(S3AFileStatus.class::cast).collect(toList());
+   

[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r321800562
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsck.java
 ##
 @@ -0,0 +1,421 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.s3a.AWSBadRequestException;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+
+import com.google.common.base.Stopwatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.security.InvalidParameterException;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Queue;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+import static java.util.stream.Collectors.toList;
+import static java.util.stream.Collectors.toSet;
+
+/**
+ * Main class for the FSCK factored out from S3GuardTool
+ * The implementation uses fixed DynamoDBMetadataStore as the backing store
+ * for metadata.
+ *
+ * Functions:
+ * 
+ *   Checking metadata consistency between S3 and metadatastore
+ * 
+ */
+public class S3GuardFsck {
+  private static final Logger LOG = LoggerFactory.getLogger(S3GuardFsck.class);
+  public static final String ROOT_PATH_STRING = "/";
+
+  private S3AFileSystem rawFS;
+  private DynamoDBMetadataStore metadataStore;
+
+  /**
+   * Creates an S3GuardFsck.
+   * @param fs the filesystem to compare to
+   * @param ms metadatastore the metadatastore to compare with (dynamo)
+   */
+  S3GuardFsck(S3AFileSystem fs, MetadataStore ms)
+  throws InvalidParameterException {
+this.rawFS = fs;
+
+if (ms == null) {
+  throw new InvalidParameterException("S3AFileSystem should be guarded by"
+  + " a " + DynamoDBMetadataStore.class.getCanonicalName());
+}
+this.metadataStore = (DynamoDBMetadataStore) ms;
+
+if (rawFS.hasMetadataStore()) {
+  throw new InvalidParameterException("Raw fs should not have a "
+  + "metadatastore.");
+}
+  }
+
+  /**
+   * Compares S3 to MS.
+   * Iterative breadth first walk on the S3 structure from a given root.
+   * Creates a list of pairs (metadata in S3 and in the MetadataStore) where
+   * the consistency or any rule is violated.
+   * Uses {@link S3GuardFsckViolationHandler} to handle violations.
+   * The violations are listed in Enums: {@link Violation}
+   *
+   * @param p the root path to start the traversal
+   * @throws IOException
+   * @return a list of {@link ComparePair}
+   */
+  public List compareS3ToMs(Path p) throws IOException {
+Stopwatch stopwatch = Stopwatch.createStarted();
+int scannedItems = 0;
+
+final Path rootPath = rawFS.qualify(p);
+S3AFileStatus root = null;
+try {
+  root = (S3AFileStatus) rawFS.getFileStatus(rootPath);
+} catch (AWSBadRequestException e) {
+  throw new IOException(e.getMessage());
+}
+final List comparePairs = new ArrayList<>();
+final Queue queue = new ArrayDeque<>();
+queue.add(root);
+
+while (!queue.isEmpty()) {
+  final S3AFileStatus currentDir = queue.poll();
+  scannedItems++;
+
+  final Path currentDirPath = currentDir.getPath();
+  List s3DirListing = 
Arrays.asList(rawFS.listStatus(currentDirPath));
 
 Review comment:
   I think this needs to handle the possibility of the raw FS raising a 
FileNotFoundException (FNFE), meaning the path has been deleted since it was queued.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: co

[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r31881
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardFsck.java
 ##
 @@ -0,0 +1,707 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+
+import java.net.URI;
+import java.util.List;
+import java.util.UUID;
+
+import org.apache.hadoop.io.IOUtils;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.s3a.AbstractS3ATestBase;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import org.assertj.core.api.Assertions;
+
+import static org.apache.hadoop.fs.contract.ContractTestUtils.touch;
+import static org.apache.hadoop.fs.s3a.Constants.METADATASTORE_AUTHORITATIVE;
+import static org.apache.hadoop.fs.s3a.Constants.S3_METADATA_STORE_IMPL;
+import static org.apache.hadoop.fs.s3a.S3ATestUtils.awaitFileStatus;
+import static 
org.apache.hadoop.fs.s3a.S3ATestUtils.metadataStorePersistsAuthoritativeBit;
+import static 
org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides;
+import static org.junit.Assume.assumeTrue;
+
+/**
+ * Integration tests for the S3Guard Fsck against a DynamoDB-backed metadata
+ * store.
+ */
+public class ITestS3GuardFsck extends AbstractS3ATestBase {
+
+  private S3AFileSystem guardedFs;
+  private S3AFileSystem rawFS;
+
+  private MetadataStore metadataStore;
+
+  @Before
+  public void setup() throws Exception {
+super.setup();
+S3AFileSystem fs = getFileSystem();
+// These test will fail if no ms
+assertTrue("FS needs to have a metadatastore.",
+fs.hasMetadataStore());
+assertTrue("Metadatastore should persist authoritative bit",
+metadataStorePersistsAuthoritativeBit(fs.getMetadataStore()));
+
+guardedFs = fs;
+metadataStore = fs.getMetadataStore();
+
+// create raw fs without s3guard
+rawFS = createUnguardedFS();
+assertFalse("Raw FS still has S3Guard " + rawFS,
+rawFS.hasMetadataStore());
+  }
+
+  @Override
+  public void teardown() throws Exception {
+if (guardedFs != null) {
+  IOUtils.cleanupWithLogger(LOG, guardedFs);
+}
+IOUtils.cleanupWithLogger(LOG, rawFS);
+super.teardown();
+  }
+
+  /**
+   * Create a test filesystem which is always unguarded.
+   * This filesystem MUST be closed in test teardown.
+   * @return the new FS
+   */
+  private S3AFileSystem createUnguardedFS() throws Exception {
+S3AFileSystem testFS = getFileSystem();
+Configuration config = new Configuration(testFS.getConf());
+URI uri = testFS.getUri();
+
+removeBaseAndBucketOverrides(uri.getHost(), config,
+S3_METADATA_STORE_IMPL);
+removeBaseAndBucketOverrides(uri.getHost(), config,
 
 Review comment:
   We need to remove authoritative paths too


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-issues-h...@hadoop.apache.org



[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r321805419
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsckViolationHandler.java
 ##
 @@ -0,0 +1,315 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.lang.reflect.InvocationTargetException;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Violation handler for the S3Guard's fsck.
+ */
+public class S3GuardFsckViolationHandler {
+  private static final Logger LOG = LoggerFactory.getLogger(
+  S3GuardFsckViolationHandler.class);
+
+  // The rawFS and metadataStore are here to prepare when the ViolationHandlers
+  // will not just log, but fix the violations, so they will have access.
+  private S3AFileSystem rawFs;
+  private DynamoDBMetadataStore metadataStore;
+
+  private static String newLine = System.getProperty("line.separator");
+
+  public S3GuardFsckViolationHandler(S3AFileSystem fs,
+  DynamoDBMetadataStore ddbms) {
+
+this.metadataStore = ddbms;
+this.rawFs = fs;
+  }
+
+  public void handle(S3GuardFsck.ComparePair comparePair) {
+if (!comparePair.containsViolation()) {
+  LOG.debug("There is no violation in the compare pair: " + toString());
+  return;
+}
+
+StringBuilder sB = new StringBuilder();
+sB.append(newLine)
+.append("On path: ").append(comparePair.getPath()).append(newLine);
+
+// Create a new instance of the handler and use it.
+for (S3GuardFsck.Violation violation : comparePair.getViolations()) {
+  try {
+ViolationHandler handler = violation.getHandler()
+.getDeclaredConstructor(S3GuardFsck.ComparePair.class)
+.newInstance(comparePair);
+final String errorStr = handler.getError();
+sB.append(errorStr);
+  } catch (NoSuchMethodException e) {
+LOG.error("Can not find declared constructor for handler: {}",
+violation.getHandler());
+  } catch (IllegalAccessException | InstantiationException | 
InvocationTargetException e) {
+LOG.error("Can not instantiate handler: {}",
+violation.getHandler());
+  }
+  sB.append(newLine);
+}
+LOG.error(sB.toString());
+  }
+
+  /**
+   * Violation handler abstract class.
+   * This class should be extended for violation handlers.
+   */
+  public static abstract class ViolationHandler {
+private final PathMetadata pathMetadata;
+private final S3AFileStatus s3FileStatus;
+private final S3AFileStatus msFileStatus;
+private final List s3DirListing;
+private final DirListingMetadata msDirListing;
+
+public ViolationHandler(S3GuardFsck.ComparePair comparePair) {
+  pathMetadata = comparePair.getMsPathMetadata();
+  s3FileStatus = comparePair.getS3FileStatus();
+  if (pathMetadata != null) {
+msFileStatus = pathMetadata.getFileStatus();
+  } else {
+msFileStatus = null;
+  }
+  s3DirListing = comparePair.getS3DirListing();
+  msDirListing = comparePair.getMsDirListing();
+}
+
+abstract String getError();
+
+public PathMetadata getPathMetadata() {
+  return pathMetadata;
+}
+
+public S3AFileStatus getS3FileStatus() {
+  return s3FileStatus;
+}
+
+public S3AFileStatus getMsFileStatus() {
+  return msFileStatus;
+}
+
+public List getS3DirListing() {
+  return s3DirListing;
+}
+
+public DirListingMetadata getMsDirListing() {
+  return msDirListing;
+}
+  }
+
+  /**
+   * The violation handler when there's no matching metadata entry in the MS.
+   */
+  public static class NoMetadataEntry extends ViolationHandler {
+
+public NoMetadataEntry(S3GuardFsck.ComparePair comparePair) {
+  super(comparePair);
+}
+
+@Override
+public String getError()

[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r31565
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardFsck.java
 ##
 @@ -0,0 +1,707 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+
+import java.net.URI;
+import java.util.List;
+import java.util.UUID;
+
+import org.apache.hadoop.io.IOUtils;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.s3a.AbstractS3ATestBase;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import org.assertj.core.api.Assertions;
+
+import static org.apache.hadoop.fs.contract.ContractTestUtils.touch;
+import static org.apache.hadoop.fs.s3a.Constants.METADATASTORE_AUTHORITATIVE;
+import static org.apache.hadoop.fs.s3a.Constants.S3_METADATA_STORE_IMPL;
+import static org.apache.hadoop.fs.s3a.S3ATestUtils.awaitFileStatus;
+import static 
org.apache.hadoop.fs.s3a.S3ATestUtils.metadataStorePersistsAuthoritativeBit;
+import static 
org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides;
+import static org.junit.Assume.assumeTrue;
+
+/**
+ * Integration tests for the S3Guard Fsck against a DynamoDB-backed metadata
+ * store.
+ */
+public class ITestS3GuardFsck extends AbstractS3ATestBase {
+
+  private S3AFileSystem guardedFs;
+  private S3AFileSystem rawFS;
+
+  private MetadataStore metadataStore;
+
+  @Before
+  public void setup() throws Exception {
+super.setup();
+S3AFileSystem fs = getFileSystem();
+// These test will fail if no ms
+assertTrue("FS needs to have a metadatastore.",
+fs.hasMetadataStore());
+assertTrue("Metadatastore should persist authoritative bit",
 
 Review comment:
   And here too


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-issues-h...@hadoop.apache.org



[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r321815770
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsck.java
 ##
 @@ -0,0 +1,421 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.s3a.AWSBadRequestException;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+
+import com.google.common.base.Stopwatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.security.InvalidParameterException;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Queue;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+import static java.util.stream.Collectors.toList;
+import static java.util.stream.Collectors.toSet;
+
+/**
+ * Main class for the FSCK factored out from S3GuardTool
+ * The implementation uses fixed DynamoDBMetadataStore as the backing store
+ * for metadata.
+ *
+ * Functions:
+ * 
+ *   Checking metadata consistency between S3 and metadatastore
+ * 
+ */
+public class S3GuardFsck {
+  private static final Logger LOG = LoggerFactory.getLogger(S3GuardFsck.class);
+  public static final String ROOT_PATH_STRING = "/";
+
+  private S3AFileSystem rawFS;
+  private DynamoDBMetadataStore metadataStore;
+
+  /**
+   * Creates an S3GuardFsck.
+   * @param fs the filesystem to compare to
+   * @param ms metadatastore the metadatastore to compare with (dynamo)
+   */
+  S3GuardFsck(S3AFileSystem fs, MetadataStore ms)
+  throws InvalidParameterException {
+this.rawFS = fs;
+
+if (ms == null) {
+  throw new InvalidParameterException("S3AFileSystem should be guarded by"
+  + " a " + DynamoDBMetadataStore.class.getCanonicalName());
+}
+this.metadataStore = (DynamoDBMetadataStore) ms;
+
+if (rawFS.hasMetadataStore()) {
+  throw new InvalidParameterException("Raw fs should not have a "
+  + "metadatastore.");
+}
+  }
+
+  /**
+   * Compares S3 to MS.
+   * Iterative breadth first walk on the S3 structure from a given root.
+   * Creates a list of pairs (metadata in S3 and in the MetadataStore) where
+   * the consistency or any rule is violated.
+   * Uses {@link S3GuardFsckViolationHandler} to handle violations.
+   * The violations are listed in Enums: {@link Violation}
+   *
+   * @param p the root path to start the traversal
+   * @throws IOException
+   * @return a list of {@link ComparePair}
+   */
+  public List compareS3ToMs(Path p) throws IOException {
+Stopwatch stopwatch = Stopwatch.createStarted();
+int scannedItems = 0;
+
+final Path rootPath = rawFS.qualify(p);
+S3AFileStatus root = null;
+try {
+  root = (S3AFileStatus) rawFS.getFileStatus(rootPath);
+} catch (AWSBadRequestException e) {
+  throw new IOException(e.getMessage());
+}
+final List comparePairs = new ArrayList<>();
+final Queue queue = new ArrayDeque<>();
+queue.add(root);
+
+while (!queue.isEmpty()) {
+  final S3AFileStatus currentDir = queue.poll();
+  scannedItems++;
+
+  final Path currentDirPath = currentDir.getPath();
+  List s3DirListing = 
Arrays.asList(rawFS.listStatus(currentDirPath));
+
+  // DIRECTORIES
+  // Check directory authoritativeness consistency
+  compareAuthoritativeDirectoryFlag(comparePairs, currentDirPath, 
s3DirListing);
+  // Add all descendant directory to the queue
+  s3DirListing.stream().filter(pm -> pm.isDirectory())
+  .map(S3AFileStatus.class::cast)
+  .forEach(pm -> queue.add(pm));
+
+  // FILES
+  // check files for consistency
+  final List children = s3DirListing.stream()
+  .filter(status -> !status.isDirectory())
+  .map(S3AFileStatus.class::cast).collect(toList());
+   

[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r321703265
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardTool.java
 ##
 @@ -1485,6 +1486,89 @@ private void vprintln(PrintStream out, String format, 
Object...
 }
   }
 
+  /**
+   * Fsck: compares S3 with the MetadataStore and reports any violations.
+   */
+  static class Fsck extends S3GuardTool {
+public static final String CHECK_FLAG = "check";
+
+public static final String NAME = "fsck";
+public static final String PURPOSE = "Compares S3 with MetadataStore, and "
++ "returns a failure status if any rules or invariants are violated. "
++ "Only works with DynamoDbMetadataStore.";
 
 Review comment:
   + say "-check" in usage so it's clear that you need the prefix


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-issues-h...@hadoop.apache.org



[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r316662769
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsck.java
 ##
 @@ -0,0 +1,395 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.security.InvalidParameterException;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Queue;
+import java.util.Set;
+
+import static java.util.stream.Collectors.toList;
+import static java.util.stream.Collectors.toSet;
+
+/**
+ * Main class for the FSCK factored out from S3GuardTool
+ * The implementation uses fixed DynamoDBMetadataStore as the backing store
+ * for metadata.
+ *
+ * Functions:
+ * 
+ *   Checking metadata consistency between S3 and metadatastore
+ * 
+ */
+public class S3GuardFsck {
+  private static final Logger LOG = LoggerFactory.getLogger(S3GuardFsck.class);
+  public static final String ROOT_PATH_STRING = "/";
+
+  private S3AFileSystem rawFS;
+  private DynamoDBMetadataStore metadataStore;
+
+  /**
+   * Creates an S3GuardFsck.
+   * @param fs the filesystem to compare to
+   * @param ms metadatastore the metadatastore to compare with (dynamo)
+   */
+  S3GuardFsck(S3AFileSystem fs, MetadataStore ms)
+  throws InvalidParameterException {
+this.rawFS = fs;
+
+if (ms == null) {
+  throw new InvalidParameterException("S3AFileSystem should be guarded by"
+  + " a " + DynamoDBMetadataStore.class.getCanonicalName());
+}
+this.metadataStore = (DynamoDBMetadataStore) ms;
+
+if (rawFS.hasMetadataStore()) {
+  throw new InvalidParameterException("Raw fs should not have a "
+  + "metadatastore.");
+}
+  }
+
+  /**
+   * Compares S3 to MS.
+   * Iterative breadth first walk on the S3 structure from a given root.
+   * Creates a list of pairs (metadata in S3 and in the MetadataStore) where
+   * the consistency or any rule is violated.
+   * Uses {@link S3GuardFsckViolationHandler} to handle violations.
+   * The violations are listed in Enums: {@link Violation}
+   *
+   * @param p the root path to start the traversal
+   * @throws IOException
+   * @return
+   */
+  public List compareS3RootToMs(Path p) throws IOException {
+final Path rootPath = rawFS.qualify(p);
+final S3AFileStatus root =
+(S3AFileStatus) rawFS.getFileStatus(rootPath);
+final List comparePairs = new ArrayList<>();
+final Queue queue = new ArrayDeque<>();
+queue.add(root);
+
+while (!queue.isEmpty()) {
+  // pop front node from the queue
+  final S3AFileStatus currentDir = queue.poll();
+
+  // Get a listing of that dir from s3 and add just the files.
+  // (Each directory will be added as a root.)
+  // Files should be casted to S3AFileStatus instead of plain FileStatus
+  // to get the VersionID and Etag.
+  final Path currentDirPath = currentDir.getPath();
+
+  final FileStatus[] s3DirListing = rawFS.listStatus(currentDirPath);
+  final List children =
+  Arrays.asList(s3DirListing).stream()
+  .filter(status -> !status.isDirectory())
+  .map(S3AFileStatus.class::cast).collect(toList());
+
+  // Compare the directory contents if the listing is authoritative
+  final DirListingMetadata msDirListing =
+  metadataStore.listChildren(currentDirPath);
+  if (msDirListing != null && msDirListing.isAuthoritative()) {
+final ComparePair cP =
+compareAuthDirListing(s3DirListing, msDirListing);
+if (cP.containsViolation()) {
+  comparePairs.add(cP);
+}
+  }
+
+  // Compare directory and contents, but not the listing
+  final

[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r316665415
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsckViolationHandler.java
 ##
 @@ -0,0 +1,312 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.lang.reflect.InvocationTargetException;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Violation handler for the S3Guard's fsck.
+ */
+public class S3GuardFsckViolationHandler {
+  private static final Logger LOG = LoggerFactory.getLogger(
+  S3GuardFsckViolationHandler.class);
+
+  private S3AFileSystem rawFs;
+  private DynamoDBMetadataStore metadataStore;
+  private static String newLine = System.getProperty("line.separator");
+
+  public S3GuardFsckViolationHandler(S3AFileSystem fs,
+  DynamoDBMetadataStore ddbms) {
+
+this.metadataStore = ddbms;
+this.rawFs = fs;
+  }
+
+  public void handle(S3GuardFsck.ComparePair comparePair) {
+if (!comparePair.containsViolation()) {
+  LOG.debug("There is no violation in the compare pair: " + toString());
+  return;
+}
+
+StringBuilder sB = new StringBuilder();
+sB.append(newLine)
+.append("On path: ").append(comparePair.getPath()).append(newLine);
+
+// Create a new instance of the handler and use it.
+for (S3GuardFsck.Violation violation : comparePair.getViolations()) {
+  try {
+ViolationHandler handler = violation.getHandler()
+.getDeclaredConstructor(S3GuardFsck.ComparePair.class)
+.newInstance(comparePair);
+final String errorStr = handler.getError();
+sB.append(errorStr);
+  } catch (NoSuchMethodException e) {
+LOG.error("Can not find declared constructor for handler: {}",
+violation.getHandler());
+  } catch (IllegalAccessException | InstantiationException | 
InvocationTargetException e) {
+LOG.error("Can not instantiate handler: {}",
+violation.getHandler());
+  }
+  sB.append(newLine);
+}
+LOG.error(sB.toString());
+  }
+
+  /**
+   * Violation handler abstract class.
+   * This class should be extended for violation handlers.
+   */
+  public static abstract class ViolationHandler {
+private final PathMetadata pathMetadata;
+private final S3AFileStatus s3FileStatus;
+private final S3AFileStatus msFileStatus;
+private final List s3DirListing;
+private final DirListingMetadata msDirListing;
+
+public ViolationHandler(S3GuardFsck.ComparePair comparePair) {
+  pathMetadata = comparePair.getMsPathMetadata();
+  s3FileStatus = comparePair.getS3FileStatus();
+  if (pathMetadata != null) {
+msFileStatus = pathMetadata.getFileStatus();
+  } else {
+msFileStatus = null;
+  }
+  s3DirListing = comparePair.getS3DirListing();
+  msDirListing = comparePair.getMsDirListing();
+}
+
+abstract String getError();
+
+public PathMetadata getPathMetadata() {
+  return pathMetadata;
+}
+
+public S3AFileStatus getS3FileStatus() {
+  return s3FileStatus;
+}
+
+public S3AFileStatus getMsFileStatus() {
+  return msFileStatus;
+}
+
+public List getS3DirListing() {
+  return s3DirListing;
+}
+
+public DirListingMetadata getMsDirListing() {
+  return msDirListing;
+}
+  }
+
+  /**
+   * The violation handler when there's no matching metadata entry in the MS.
+   */
+  public static class NoMetadataEntry extends ViolationHandler {
+
+public NoMetadataEntry(S3GuardFsck.ComparePair comparePair) {
+  super(comparePair);
+}
+
+@Override
+public String getError() {
+  return "No PathMetadata for this path in the MS.";
+}
+  }
+
+  /**
+   * The violation handler when there's no parent entry.
+   */
+  public s

[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r321807686
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardTool.java
 ##
 @@ -1485,6 +1486,93 @@ private void vprintln(PrintStream out, String format, 
Object...
 }
   }
 
+  /**
+   * Fsck: compares S3 with the MetadataStore and reports any violations.
+   */
+  static class Fsck extends S3GuardTool {
+public static final String CHECK_FLAG = "check";
+
+public static final String NAME = "fsck";
+public static final String PURPOSE = "Compares S3 with MetadataStore, and "
++ "returns a failure status if any rules or invariants are violated. "
++ "Only works with DynamoDbMetadataStore.";
+private static final String USAGE = NAME + " [OPTIONS] [s3a://BUCKET]\n" +
+"\t" + PURPOSE + "\n\n" +
+"Common options:\n" +
+"  " + CHECK_FLAG + " Check the metadata store for errors, but do "
 
 Review comment:
   add a - in front of the check flag


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-issues-h...@hadoop.apache.org



[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r321799314
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsck.java
 ##
 @@ -0,0 +1,421 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.s3a.AWSBadRequestException;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+
+import com.google.common.base.Stopwatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.security.InvalidParameterException;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Queue;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+import static java.util.stream.Collectors.toList;
+import static java.util.stream.Collectors.toSet;
+
+/**
+ * Main class for the FSCK factored out from S3GuardTool
+ * The implementation uses fixed DynamoDBMetadataStore as the backing store
+ * for metadata.
+ *
+ * Functions:
+ * 
+ *   Checking metadata consistency between S3 and metadatastore
+ * 
+ */
+public class S3GuardFsck {
+  private static final Logger LOG = LoggerFactory.getLogger(S3GuardFsck.class);
+  public static final String ROOT_PATH_STRING = "/";
+
+  private S3AFileSystem rawFS;
+  private DynamoDBMetadataStore metadataStore;
+
+  /**
+   * Creates an S3GuardFsck.
+   * @param fs the filesystem to compare to
+   * @param ms metadatastore the metadatastore to compare with (dynamo)
+   */
+  S3GuardFsck(S3AFileSystem fs, MetadataStore ms)
+  throws InvalidParameterException {
+this.rawFS = fs;
+
+if (ms == null) {
+  throw new InvalidParameterException("S3AFileSystem should be guarded by"
+  + " a " + DynamoDBMetadataStore.class.getCanonicalName());
+}
+this.metadataStore = (DynamoDBMetadataStore) ms;
+
+if (rawFS.hasMetadataStore()) {
+  throw new InvalidParameterException("Raw fs should not have a "
+  + "metadatastore.");
+}
+  }
+
+  /**
+   * Compares S3 to MS.
+   * Iterative breadth first walk on the S3 structure from a given root.
+   * Creates a list of pairs (metadata in S3 and in the MetadataStore) where
+   * the consistency or any rule is violated.
+   * Uses {@link S3GuardFsckViolationHandler} to handle violations.
+   * The violations are listed in Enums: {@link Violation}
+   *
+   * @param p the root path to start the traversal
+   * @throws IOException
+   * @return a list of {@link ComparePair}
+   */
+  public List compareS3ToMs(Path p) throws IOException {
+Stopwatch stopwatch = Stopwatch.createStarted();
+int scannedItems = 0;
+
+final Path rootPath = rawFS.qualify(p);
+S3AFileStatus root = null;
+try {
+  root = (S3AFileStatus) rawFS.getFileStatus(rootPath);
+} catch (AWSBadRequestException e) {
+  throw new IOException(e.getMessage());
+}
+final List comparePairs = new ArrayList<>();
+final Queue queue = new ArrayDeque<>();
+queue.add(root);
+
+while (!queue.isEmpty()) {
+  final S3AFileStatus currentDir = queue.poll();
+  scannedItems++;
+
+  final Path currentDirPath = currentDir.getPath();
+  List s3DirListing = 
Arrays.asList(rawFS.listStatus(currentDirPath));
+
+  // DIRECTORIES
+  // Check directory authoritativeness consistency
+  compareAuthoritativeDirectoryFlag(comparePairs, currentDirPath, 
s3DirListing);
+  // Add all descendant directory to the queue
+  s3DirListing.stream().filter(pm -> pm.isDirectory())
+  .map(S3AFileStatus.class::cast)
+  .forEach(pm -> queue.add(pm));
+
+  // FILES
+  // check files for consistency
+  final List children = s3DirListing.stream()
+  .filter(status -> !status.isDirectory())
+  .map(S3AFileStatus.class::cast).collect(toList());
+   

[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r316659206
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsck.java
 ##
 @@ -0,0 +1,395 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.security.InvalidParameterException;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Queue;
+import java.util.Set;
+
+import static java.util.stream.Collectors.toList;
+import static java.util.stream.Collectors.toSet;
+
+/**
+ * Main class for the FSCK factored out from S3GuardTool
+ * The implementation uses fixed DynamoDBMetadataStore as the backing store
+ * for metadata.
+ *
+ * Functions:
+ * 
+ *   Checking metadata consistency between S3 and metadatastore
+ * 
+ */
+public class S3GuardFsck {
+  private static final Logger LOG = LoggerFactory.getLogger(S3GuardFsck.class);
+  public static final String ROOT_PATH_STRING = "/";
+
+  private S3AFileSystem rawFS;
 
 Review comment:
should these be final?


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-issues-h...@hadoop.apache.org



[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r316663521
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsck.java
 ##
 @@ -0,0 +1,395 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.security.InvalidParameterException;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Queue;
+import java.util.Set;
+
+import static java.util.stream.Collectors.toList;
+import static java.util.stream.Collectors.toSet;
+
+/**
+ * Main class for the FSCK factored out from S3GuardTool
+ * The implementation uses fixed DynamoDBMetadataStore as the backing store
+ * for metadata.
+ *
+ * Functions:
+ * 
+ *   Checking metadata consistency between S3 and metadatastore
+ * 
+ */
+public class S3GuardFsck {
+  private static final Logger LOG = LoggerFactory.getLogger(S3GuardFsck.class);
+  public static final String ROOT_PATH_STRING = "/";
+
+  private S3AFileSystem rawFS;
+  private DynamoDBMetadataStore metadataStore;
+
+  /**
+   * Creates an S3GuardFsck.
+   * @param fs the filesystem to compare to
+   * @param ms metadatastore the metadatastore to compare with (dynamo)
+   */
+  S3GuardFsck(S3AFileSystem fs, MetadataStore ms)
+  throws InvalidParameterException {
+this.rawFS = fs;
+
+if (ms == null) {
+  throw new InvalidParameterException("S3AFileSystem should be guarded by"
+  + " a " + DynamoDBMetadataStore.class.getCanonicalName());
+}
+this.metadataStore = (DynamoDBMetadataStore) ms;
+
+if (rawFS.hasMetadataStore()) {
+  throw new InvalidParameterException("Raw fs should not have a "
+  + "metadatastore.");
+}
+  }
+
+  /**
+   * Compares S3 to MS.
+   * Iterative breadth first walk on the S3 structure from a given root.
+   * Creates a list of pairs (metadata in S3 and in the MetadataStore) where
+   * the consistency or any rule is violated.
+   * Uses {@link S3GuardFsckViolationHandler} to handle violations.
+   * The violations are listed in Enums: {@link Violation}
+   *
+   * @param p the root path to start the traversal
+   * @throws IOException
+   * @return
+   */
+  public List compareS3RootToMs(Path p) throws IOException {
+final Path rootPath = rawFS.qualify(p);
+final S3AFileStatus root =
+(S3AFileStatus) rawFS.getFileStatus(rootPath);
+final List comparePairs = new ArrayList<>();
+final Queue queue = new ArrayDeque<>();
+queue.add(root);
+
+while (!queue.isEmpty()) {
+  // pop front node from the queue
+  final S3AFileStatus currentDir = queue.poll();
+
+  // Get a listing of that dir from s3 and add just the files.
+  // (Each directory will be added as a root.)
+  // Files should be casted to S3AFileStatus instead of plain FileStatus
+  // to get the VersionID and Etag.
+  final Path currentDirPath = currentDir.getPath();
+
+  final FileStatus[] s3DirListing = rawFS.listStatus(currentDirPath);
+  final List children =
+  Arrays.asList(s3DirListing).stream()
+  .filter(status -> !status.isDirectory())
+  .map(S3AFileStatus.class::cast).collect(toList());
+
+  // Compare the directory contents if the listing is authoritative
+  final DirListingMetadata msDirListing =
+  metadataStore.listChildren(currentDirPath);
+  if (msDirListing != null && msDirListing.isAuthoritative()) {
+final ComparePair cP =
+compareAuthDirListing(s3DirListing, msDirListing);
+if (cP.containsViolation()) {
+  comparePairs.add(cP);
+}
+  }
+
+  // Compare directory and contents, but not the listing
+  final

[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r321812633
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardFsck.java
 ##
 @@ -0,0 +1,707 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+
+import java.net.URI;
+import java.util.List;
+import java.util.UUID;
+
+import org.apache.hadoop.io.IOUtils;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.s3a.AbstractS3ATestBase;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import org.assertj.core.api.Assertions;
+
+import static org.apache.hadoop.fs.contract.ContractTestUtils.touch;
+import static org.apache.hadoop.fs.s3a.Constants.METADATASTORE_AUTHORITATIVE;
+import static org.apache.hadoop.fs.s3a.Constants.S3_METADATA_STORE_IMPL;
+import static org.apache.hadoop.fs.s3a.S3ATestUtils.awaitFileStatus;
+import static 
org.apache.hadoop.fs.s3a.S3ATestUtils.metadataStorePersistsAuthoritativeBit;
+import static 
org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides;
+import static org.junit.Assume.assumeTrue;
+
+/**
+ * Integration tests for the S3Guard Fsck against a dyamodb backed metadata
+ * store.
+ */
+public class ITestS3GuardFsck extends AbstractS3ATestBase {
+
+  private S3AFileSystem guardedFs;
+  private S3AFileSystem rawFS;
+
+  private MetadataStore metadataStore;
+
+  @Before
+  public void setup() throws Exception {
+super.setup();
+S3AFileSystem fs = getFileSystem();
+// These test will fail if no ms
+assertTrue("FS needs to have a metadatastore.",
+fs.hasMetadataStore());
+assertTrue("Metadatastore should persist authoritative bit",
+metadataStorePersistsAuthoritativeBit(fs.getMetadataStore()));
+
+guardedFs = fs;
+metadataStore = fs.getMetadataStore();
+
+// create raw fs without s3guard
+rawFS = createUnguardedFS();
+assertFalse("Raw FS still has S3Guard " + rawFS,
+rawFS.hasMetadataStore());
+  }
+
+  @Override
+  public void teardown() throws Exception {
+if (guardedFs != null) {
+  IOUtils.cleanupWithLogger(LOG, guardedFs);
+}
+IOUtils.cleanupWithLogger(LOG, rawFS);
+super.teardown();
+  }
+
+  /**
+   * Create a test filesystem which is always unguarded.
+   * This filesystem MUST be closed in test teardown.
+   * @return the new FS
+   */
+  private S3AFileSystem createUnguardedFS() throws Exception {
+S3AFileSystem testFS = getFileSystem();
+Configuration config = new Configuration(testFS.getConf());
+URI uri = testFS.getUri();
+
+removeBaseAndBucketOverrides(uri.getHost(), config,
+S3_METADATA_STORE_IMPL);
+removeBaseAndBucketOverrides(uri.getHost(), config,
+METADATASTORE_AUTHORITATIVE);
+S3AFileSystem fs2 = new S3AFileSystem();
+fs2.initialize(uri, config);
+return fs2;
+  }
+
+  @Test
+  public void testIDetectNoMetadataEntry() throws Exception {
+final Path cwd = path("/" + getMethodName() + "-" + UUID.randomUUID());
+final Path file = new Path(cwd, "file");
+try {
+  touch(rawFS, file);
+  awaitFileStatus(rawFS, file);
+
+  final S3GuardFsck s3GuardFsck =
+  new S3GuardFsck(rawFS, metadataStore);
+
+  final List comparePairs =
+  s3GuardFsck.compareS3ToMs(cwd);
+
+  assertEquals("Number of pairs should be two.", 2,
+  comparePairs.size());
+  final S3GuardFsck.ComparePair pair = comparePairs.get(0);
+  assertTrue("The pair must contain a violation.", 
pair.containsViolation());
+  assertEquals("The pair must contain only one violation", 1,
+  pair.getViolations().size());
+
+  final S3GuardFsck.Violation violation =
+  pair.getViolations().iterator().next();
+  assertEquals("The violation should be that there is no violation entry.",
+  violation, S3GuardFsck.Vi

[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r321703047
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardTool.java
 ##
 @@ -1485,6 +1486,93 @@ private void vprintln(PrintStream out, String format, 
Object...
 }
   }
 
+  /**
+   * Prune metadata that has not been modified recently.
 
 Review comment:
   javadoc needs updating


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-issues-h...@hadoop.apache.org



[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r316659790
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsck.java
 ##
 @@ -0,0 +1,395 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.security.InvalidParameterException;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Queue;
+import java.util.Set;
+
+import static java.util.stream.Collectors.toList;
+import static java.util.stream.Collectors.toSet;
+
+/**
+ * Main class for the FSCK factored out from S3GuardTool
+ * The implementation uses fixed DynamoDBMetadataStore as the backing store
+ * for metadata.
+ *
+ * Functions:
+ * 
+ *   Checking metadata consistency between S3 and metadatastore
+ * 
+ */
+public class S3GuardFsck {
+  private static final Logger LOG = LoggerFactory.getLogger(S3GuardFsck.class);
+  public static final String ROOT_PATH_STRING = "/";
+
+  private S3AFileSystem rawFS;
+  private DynamoDBMetadataStore metadataStore;
+
+  /**
+   * Creates an S3GuardFsck.
+   * @param fs the filesystem to compare to
+   * @param ms metadatastore the metadatastore to compare with (dynamo)
+   */
+  S3GuardFsck(S3AFileSystem fs, MetadataStore ms)
+  throws InvalidParameterException {
+this.rawFS = fs;
+
+if (ms == null) {
+  throw new InvalidParameterException("S3AFileSystem should be guarded by"
+  + " a " + DynamoDBMetadataStore.class.getCanonicalName());
+}
+this.metadataStore = (DynamoDBMetadataStore) ms;
+
+if (rawFS.hasMetadataStore()) {
+  throw new InvalidParameterException("Raw fs should not have a "
 
 Review comment:
You can use google preconditions


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-issues-h...@hadoop.apache.org



[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r316663888
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsckViolationHandler.java
 ##
 @@ -0,0 +1,312 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.lang.reflect.InvocationTargetException;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Violation handler for the S3Guard's fsck.
+ */
+public class S3GuardFsckViolationHandler {
+  private static final Logger LOG = LoggerFactory.getLogger(
+  S3GuardFsckViolationHandler.class);
+
+  private S3AFileSystem rawFs;
 
 Review comment:
 Mark as final


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-issues-h...@hadoop.apache.org



[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r316658728
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java
 ##
 @@ -1449,7 +1449,7 @@ public boolean hasMetadataStore() {
* is set for this filesystem.
*/
   @VisibleForTesting
-  boolean hasAuthoritativeMetadataStore() {
+  public boolean hasAuthoritativeMetadataStore() {
 
 Review comment:
   Now that this is done on a per-path basis, is this probe adequate?


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-issues-h...@hadoop.apache.org



[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r316661259
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsck.java
 ##
 @@ -0,0 +1,395 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.security.InvalidParameterException;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Queue;
+import java.util.Set;
+
+import static java.util.stream.Collectors.toList;
+import static java.util.stream.Collectors.toSet;
+
+/**
+ * Main class for the FSCK factored out from S3GuardTool
+ * The implementation uses fixed DynamoDBMetadataStore as the backing store
+ * for metadata.
+ *
+ * Functions:
+ * 
+ *   Checking metadata consistency between S3 and metadatastore
+ * 
+ */
+public class S3GuardFsck {
+  private static final Logger LOG = LoggerFactory.getLogger(S3GuardFsck.class);
+  public static final String ROOT_PATH_STRING = "/";
+
+  private S3AFileSystem rawFS;
+  private DynamoDBMetadataStore metadataStore;
+
+  /**
+   * Creates an S3GuardFsck.
+   * @param fs the filesystem to compare to
+   * @param ms metadatastore the metadatastore to compare with (dynamo)
+   */
+  S3GuardFsck(S3AFileSystem fs, MetadataStore ms)
+  throws InvalidParameterException {
+this.rawFS = fs;
+
+if (ms == null) {
+  throw new InvalidParameterException("S3AFileSystem should be guarded by"
+  + " a " + DynamoDBMetadataStore.class.getCanonicalName());
+}
+this.metadataStore = (DynamoDBMetadataStore) ms;
+
+if (rawFS.hasMetadataStore()) {
+  throw new InvalidParameterException("Raw fs should not have a "
+  + "metadatastore.");
+}
+  }
+
+  /**
+   * Compares S3 to MS.
+   * Iterative breadth first walk on the S3 structure from a given root.
+   * Creates a list of pairs (metadata in S3 and in the MetadataStore) where
+   * the consistency or any rule is violated.
+   * Uses {@link S3GuardFsckViolationHandler} to handle violations.
+   * The violations are listed in Enums: {@link Violation}
+   *
+   * @param p the root path to start the traversal
+   * @throws IOException
+   * @return
+   */
+  public List compareS3RootToMs(Path p) throws IOException {
+final Path rootPath = rawFS.qualify(p);
+final S3AFileStatus root =
+(S3AFileStatus) rawFS.getFileStatus(rootPath);
+final List comparePairs = new ArrayList<>();
+final Queue queue = new ArrayDeque<>();
+queue.add(root);
+
+while (!queue.isEmpty()) {
+  // pop front node from the queue
+  final S3AFileStatus currentDir = queue.poll();
+
+  // Get a listing of that dir from s3 and add just the files.
+  // (Each directory will be added as a root.)
+  // Files should be casted to S3AFileStatus instead of plain FileStatus
+  // to get the VersionID and Etag.
+  final Path currentDirPath = currentDir.getPath();
+
+  final FileStatus[] s3DirListing = rawFS.listStatus(currentDirPath);
+  final List children =
+  Arrays.asList(s3DirListing).stream()
+  .filter(status -> !status.isDirectory())
+  .map(S3AFileStatus.class::cast).collect(toList());
+
+  // Compare the directory contents if the listing is authoritative
+  final DirListingMetadata msDirListing =
+  metadataStore.listChildren(currentDirPath);
+  if (msDirListing != null && msDirListing.isAuthoritative()) {
+final ComparePair cP =
+compareAuthDirListing(s3DirListing, msDirListing);
+if (cP.containsViolation()) {
+  comparePairs.add(cP);
+}
+  }
+
+  // Compare directory and contents, but not the listing
+  final

[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r321797720
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsck.java
 ##
 @@ -0,0 +1,421 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.s3a.AWSBadRequestException;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+
+import com.google.common.base.Stopwatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.security.InvalidParameterException;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Queue;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+import static java.util.stream.Collectors.toList;
+import static java.util.stream.Collectors.toSet;
+
+/**
+ * Main class for the FSCK factored out from S3GuardTool
+ * The implementation uses fixed DynamoDBMetadataStore as the backing store
+ * for metadata.
+ *
+ * Functions:
+ * 
+ *   Checking metadata consistency between S3 and metadatastore
+ * 
+ */
+public class S3GuardFsck {
+  private static final Logger LOG = LoggerFactory.getLogger(S3GuardFsck.class);
+  public static final String ROOT_PATH_STRING = "/";
+
+  private S3AFileSystem rawFS;
+  private DynamoDBMetadataStore metadataStore;
+
+  /**
+   * Creates an S3GuardFsck.
+   * @param fs the filesystem to compare to
+   * @param ms metadatastore the metadatastore to compare with (dynamo)
+   */
+  S3GuardFsck(S3AFileSystem fs, MetadataStore ms)
+  throws InvalidParameterException {
+this.rawFS = fs;
+
+if (ms == null) {
+  throw new InvalidParameterException("S3AFileSystem should be guarded by"
 
 Review comment:
   How about "S3A Bucket s3a://bucket-name should...", with the bucket name 
included to help people debug things. Also, avoid mentioning S3AFileSystem, as 
that's an implementation detail.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-issues-h...@hadoop.apache.org



[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r316658314
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardFsck.java
 ##
 @@ -0,0 +1,707 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+
+import java.net.URI;
+import java.util.List;
+import java.util.UUID;
+
+import org.apache.hadoop.io.IOUtils;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.s3a.AbstractS3ATestBase;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import org.assertj.core.api.Assertions;
 
 Review comment:
   Check your import placement.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-issues-h...@hadoop.apache.org



[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r31480
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardFsck.java
 ##
 @@ -0,0 +1,707 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+
+import java.net.URI;
+import java.util.List;
+import java.util.UUID;
+
+import org.apache.hadoop.io.IOUtils;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.s3a.AbstractS3ATestBase;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import org.assertj.core.api.Assertions;
+
+import static org.apache.hadoop.fs.contract.ContractTestUtils.touch;
+import static org.apache.hadoop.fs.s3a.Constants.METADATASTORE_AUTHORITATIVE;
+import static org.apache.hadoop.fs.s3a.Constants.S3_METADATA_STORE_IMPL;
+import static org.apache.hadoop.fs.s3a.S3ATestUtils.awaitFileStatus;
+import static 
org.apache.hadoop.fs.s3a.S3ATestUtils.metadataStorePersistsAuthoritativeBit;
+import static 
org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides;
+import static org.junit.Assume.assumeTrue;
+
+/**
+ * Integration tests for the S3Guard Fsck against a dyamodb backed metadata
+ * store.
+ */
+public class ITestS3GuardFsck extends AbstractS3ATestBase {
+
+  private S3AFileSystem guardedFs;
+  private S3AFileSystem rawFS;
+
+  private MetadataStore metadataStore;
+
+  @Before
+  public void setup() throws Exception {
+super.setup();
+S3AFileSystem fs = getFileSystem();
+// These test will fail if no ms
+assertTrue("FS needs to have a metadatastore.",
 
 Review comment:
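   Make this an assume (e.g. assumeTrue) rather than an assert, so the test is 
skipped instead of failing when the filesystem has no metadata store.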


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-issues-h...@hadoop.apache.org



[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r316662402
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsck.java
 ##
 @@ -0,0 +1,395 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.security.InvalidParameterException;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Queue;
+import java.util.Set;
+
+import static java.util.stream.Collectors.toList;
+import static java.util.stream.Collectors.toSet;
+
+/**
+ * Main class for the FSCK factored out from S3GuardTool
+ * The implementation uses fixed DynamoDBMetadataStore as the backing store
+ * for metadata.
+ *
+ * Functions:
+ * 
+ *   Checking metadata consistency between S3 and metadatastore
+ * 
+ */
+public class S3GuardFsck {
+  private static final Logger LOG = LoggerFactory.getLogger(S3GuardFsck.class);
+  public static final String ROOT_PATH_STRING = "/";
+
+  private S3AFileSystem rawFS;
+  private DynamoDBMetadataStore metadataStore;
+
+  /**
+   * Creates an S3GuardFsck.
+   * @param fs the filesystem to compare to
+   * @param ms metadatastore the metadatastore to compare with (dynamo)
+   */
+  S3GuardFsck(S3AFileSystem fs, MetadataStore ms)
+  throws InvalidParameterException {
+this.rawFS = fs;
+
+if (ms == null) {
+  throw new InvalidParameterException("S3AFileSystem should be guarded by"
+  + " a " + DynamoDBMetadataStore.class.getCanonicalName());
+}
+this.metadataStore = (DynamoDBMetadataStore) ms;
+
+if (rawFS.hasMetadataStore()) {
+  throw new InvalidParameterException("Raw fs should not have a "
+  + "metadatastore.");
+}
+  }
+
+  /**
+   * Compares S3 to MS.
+   * Iterative breadth first walk on the S3 structure from a given root.
+   * Creates a list of pairs (metadata in S3 and in the MetadataStore) where
+   * the consistency or any rule is violated.
+   * Uses {@link S3GuardFsckViolationHandler} to handle violations.
+   * The violations are listed in Enums: {@link Violation}
+   *
+   * @param p the root path to start the traversal
+   * @throws IOException
+   * @return
+   */
+  public List compareS3RootToMs(Path p) throws IOException {
+final Path rootPath = rawFS.qualify(p);
+final S3AFileStatus root =
+(S3AFileStatus) rawFS.getFileStatus(rootPath);
+final List comparePairs = new ArrayList<>();
+final Queue queue = new ArrayDeque<>();
+queue.add(root);
+
+while (!queue.isEmpty()) {
+  // pop front node from the queue
+  final S3AFileStatus currentDir = queue.poll();
+
+  // Get a listing of that dir from s3 and add just the files.
+  // (Each directory will be added as a root.)
+  // Files should be casted to S3AFileStatus instead of plain FileStatus
+  // to get the VersionID and Etag.
+  final Path currentDirPath = currentDir.getPath();
+
+  final FileStatus[] s3DirListing = rawFS.listStatus(currentDirPath);
+  final List children =
+  Arrays.asList(s3DirListing).stream()
+  .filter(status -> !status.isDirectory())
+  .map(S3AFileStatus.class::cast).collect(toList());
+
+  // Compare the directory contents if the listing is authoritative
+  final DirListingMetadata msDirListing =
+  metadataStore.listChildren(currentDirPath);
+  if (msDirListing != null && msDirListing.isAuthoritative()) {
+final ComparePair cP =
+compareAuthDirListing(s3DirListing, msDirListing);
+if (cP.containsViolation()) {
+  comparePairs.add(cP);
+}
+  }
+
+  // Compare directory and contents, but not the listing
+  final

[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r316658848
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsck.java
 ##
 @@ -0,0 +1,395 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
 
 Review comment:
   Check the import order


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-issues-h...@hadoop.apache.org



[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r316663381
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsck.java
 ##
 @@ -0,0 +1,395 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.security.InvalidParameterException;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Queue;
+import java.util.Set;
+
+import static java.util.stream.Collectors.toList;
+import static java.util.stream.Collectors.toSet;
+
+/**
+ * Main class for the FSCK factored out from S3GuardTool
+ * The implementation uses fixed DynamoDBMetadataStore as the backing store
+ * for metadata.
+ *
+ * Functions:
+ * 
+ *   Checking metadata consistency between S3 and metadatastore
+ * 
+ */
+public class S3GuardFsck {
+  private static final Logger LOG = LoggerFactory.getLogger(S3GuardFsck.class);
+  public static final String ROOT_PATH_STRING = "/";
+
+  private S3AFileSystem rawFS;
+  private DynamoDBMetadataStore metadataStore;
+
+  /**
+   * Creates an S3GuardFsck.
+   * @param fs the filesystem to compare to
+   * @param ms metadatastore the metadatastore to compare with (dynamo)
+   */
+  S3GuardFsck(S3AFileSystem fs, MetadataStore ms)
+  throws InvalidParameterException {
+this.rawFS = fs;
+
+if (ms == null) {
+  throw new InvalidParameterException("S3AFileSystem should be guarded by"
+  + " a " + DynamoDBMetadataStore.class.getCanonicalName());
+}
+this.metadataStore = (DynamoDBMetadataStore) ms;
+
+if (rawFS.hasMetadataStore()) {
+  throw new InvalidParameterException("Raw fs should not have a "
+  + "metadatastore.");
+}
+  }
+
+  /**
+   * Compares S3 to MS.
+   * Iterative breadth first walk on the S3 structure from a given root.
+   * Creates a list of pairs (metadata in S3 and in the MetadataStore) where
+   * the consistency or any rule is violated.
+   * Uses {@link S3GuardFsckViolationHandler} to handle violations.
+   * The violations are listed in Enums: {@link Violation}
+   *
+   * @param p the root path to start the traversal
+   * @throws IOException
+   * @return
+   */
+  public List compareS3RootToMs(Path p) throws IOException {
+final Path rootPath = rawFS.qualify(p);
+final S3AFileStatus root =
+(S3AFileStatus) rawFS.getFileStatus(rootPath);
+final List comparePairs = new ArrayList<>();
+final Queue queue = new ArrayDeque<>();
+queue.add(root);
+
+while (!queue.isEmpty()) {
+  // pop front node from the queue
+  final S3AFileStatus currentDir = queue.poll();
+
+  // Get a listing of that dir from s3 and add just the files.
+  // (Each directory will be added as a root.)
+  // Files should be casted to S3AFileStatus instead of plain FileStatus
+  // to get the VersionID and Etag.
+  final Path currentDirPath = currentDir.getPath();
+
+  final FileStatus[] s3DirListing = rawFS.listStatus(currentDirPath);
+  final List children =
+  Arrays.asList(s3DirListing).stream()
+  .filter(status -> !status.isDirectory())
+  .map(S3AFileStatus.class::cast).collect(toList());
+
+  // Compare the directory contents if the listing is authoritative
+  final DirListingMetadata msDirListing =
+  metadataStore.listChildren(currentDirPath);
+  if (msDirListing != null && msDirListing.isAuthoritative()) {
+final ComparePair cP =
+compareAuthDirListing(s3DirListing, msDirListing);
+if (cP.containsViolation()) {
+  comparePairs.add(cP);
+}
+  }
+
+  // Compare directory and contents, but not the listing
+  final

[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r316668834
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardToolDynamoDB.java
 ##
 @@ -32,6 +32,7 @@
 import com.amazonaws.services.dynamodbv2.model.ResourceInUseException;
 import com.amazonaws.services.dynamodbv2.model.ResourceNotFoundException;
 import com.amazonaws.services.dynamodbv2.model.Tag;
+import org.apache.hadoop.util.ExitUtil;
 
 Review comment:
   Import placement


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-issues-h...@hadoop.apache.org



[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r316665134
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsckViolationHandler.java
 ##
 @@ -0,0 +1,312 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.lang.reflect.InvocationTargetException;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Violation handler for the S3Guard's fsck.
+ */
+public class S3GuardFsckViolationHandler {
+  private static final Logger LOG = LoggerFactory.getLogger(
+  S3GuardFsckViolationHandler.class);
+
+  private S3AFileSystem rawFs;
+  private DynamoDBMetadataStore metadataStore;
+  private static String newLine = System.getProperty("line.separator");
+
+  public S3GuardFsckViolationHandler(S3AFileSystem fs,
+  DynamoDBMetadataStore ddbms) {
+
+this.metadataStore = ddbms;
+this.rawFs = fs;
+  }
+
+  public void handle(S3GuardFsck.ComparePair comparePair) {
+if (!comparePair.containsViolation()) {
+  LOG.debug("There is no violation in the compare pair: " + toString());
+  return;
+}
+
+StringBuilder sB = new StringBuilder();
+sB.append(newLine)
+.append("On path: ").append(comparePair.getPath()).append(newLine);
+
+// Create a new instance of the handler and use it.
+for (S3GuardFsck.Violation violation : comparePair.getViolations()) {
+  try {
+ViolationHandler handler = violation.getHandler()
+.getDeclaredConstructor(S3GuardFsck.ComparePair.class)
+.newInstance(comparePair);
+final String errorStr = handler.getError();
+sB.append(errorStr);
+  } catch (NoSuchMethodException e) {
+LOG.error("Can not find declared constructor for handler: {}",
+violation.getHandler());
+  } catch (IllegalAccessException | InstantiationException | 
InvocationTargetException e) {
+LOG.error("Can not instantiate handler: {}",
+violation.getHandler());
+  }
+  sB.append(newLine);
+}
+LOG.error(sB.toString());
+  }
+
+  /**
+   * Violation handler abstract class.
+   * This class should be extended for violation handlers.
+   */
+  public static abstract class ViolationHandler {
+private final PathMetadata pathMetadata;
+private final S3AFileStatus s3FileStatus;
+private final S3AFileStatus msFileStatus;
+private final List s3DirListing;
+private final DirListingMetadata msDirListing;
+
+public ViolationHandler(S3GuardFsck.ComparePair comparePair) {
+  pathMetadata = comparePair.getMsPathMetadata();
+  s3FileStatus = comparePair.getS3FileStatus();
+  if (pathMetadata != null) {
+msFileStatus = pathMetadata.getFileStatus();
+  } else {
+msFileStatus = null;
+  }
+  s3DirListing = comparePair.getS3DirListing();
+  msDirListing = comparePair.getMsDirListing();
+}
+
+abstract String getError();
+
+public PathMetadata getPathMetadata() {
+  return pathMetadata;
+}
+
+public S3AFileStatus getS3FileStatus() {
+  return s3FileStatus;
+}
+
+public S3AFileStatus getMsFileStatus() {
+  return msFileStatus;
+}
+
+public List getS3DirListing() {
+  return s3DirListing;
+}
+
+public DirListingMetadata getMsDirListing() {
+  return msDirListing;
+}
+  }
+
+  /**
+   * The violation handler when there's no matching metadata entry in the MS.
+   */
+  public static class NoMetadataEntry extends ViolationHandler {
+
+public NoMetadataEntry(S3GuardFsck.ComparePair comparePair) {
+  super(comparePair);
+}
+
+@Override
+public String getError() {
+  return "No PathMetadata for this path in the MS.";
+}
+  }
+
+  /**
+   * The violation handler when there's no parent entry.
+   */
+  public s

[GitHub] [hadoop] steveloughran commented on a change in pull request #1208: HADOOP-16423. S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)

2019-09-06 Thread GitBox
steveloughran commented on a change in pull request #1208: HADOOP-16423. 
S3Guard fsck: Check metadata consistency between S3 and metadatastore (log)
URL: https://github.com/apache/hadoop/pull/1208#discussion_r316660993
 
 

 ##
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardFsck.java
 ##
 @@ -0,0 +1,395 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.s3a.S3AFileStatus;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.security.InvalidParameterException;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Queue;
+import java.util.Set;
+
+import static java.util.stream.Collectors.toList;
+import static java.util.stream.Collectors.toSet;
+
+/**
+ * Main class for the FSCK factored out from S3GuardTool
+ * The implementation uses fixed DynamoDBMetadataStore as the backing store
+ * for metadata.
+ *
+ * Functions:
+ * 
+ *   Checking metadata consistency between S3 and metadatastore
+ * 
+ */
+public class S3GuardFsck {
+  private static final Logger LOG = LoggerFactory.getLogger(S3GuardFsck.class);
+  public static final String ROOT_PATH_STRING = "/";
+
+  private S3AFileSystem rawFS;
+  private DynamoDBMetadataStore metadataStore;
+
+  /**
+   * Creates an S3GuardFsck.
+   * @param fs the filesystem to compare to
+   * @param ms metadatastore the metadatastore to compare with (dynamo)
+   */
+  S3GuardFsck(S3AFileSystem fs, MetadataStore ms)
+  throws InvalidParameterException {
+this.rawFS = fs;
+
+if (ms == null) {
+  throw new InvalidParameterException("S3AFileSystem should be guarded by"
+  + " a " + DynamoDBMetadataStore.class.getCanonicalName());
+}
+this.metadataStore = (DynamoDBMetadataStore) ms;
+
+if (rawFS.hasMetadataStore()) {
+  throw new InvalidParameterException("Raw fs should not have a "
+  + "metadatastore.");
+}
+  }
+
+  /**
+   * Compares S3 to MS.
+   * Iterative breadth first walk on the S3 structure from a given root.
+   * Creates a list of pairs (metadata in S3 and in the MetadataStore) where
+   * the consistency or any rule is violated.
+   * Uses {@link S3GuardFsckViolationHandler} to handle violations.
+   * The violations are listed in Enums: {@link Violation}
+   *
+   * @param p the root path to start the traversal
+   * @throws IOException
+   * @return
+   */
+  public List compareS3RootToMs(Path p) throws IOException {
+final Path rootPath = rawFS.qualify(p);
+final S3AFileStatus root =
+(S3AFileStatus) rawFS.getFileStatus(rootPath);
+final List comparePairs = new ArrayList<>();
+final Queue queue = new ArrayDeque<>();
+queue.add(root);
+
+while (!queue.isEmpty()) {
+  // pop front node from the queue
+  final S3AFileStatus currentDir = queue.poll();
+
+  // Get a listing of that dir from s3 and add just the files.
+  // (Each directory will be added as a root.)
+  // Files should be casted to S3AFileStatus instead of plain FileStatus
+  // to get the VersionID and Etag.
+  final Path currentDirPath = currentDir.getPath();
+
+  final FileStatus[] s3DirListing = rawFS.listStatus(currentDirPath);
+  final List children =
+  Arrays.asList(s3DirListing).stream()
+  .filter(status -> !status.isDirectory())
+  .map(S3AFileStatus.class::cast).collect(toList());
+
+  // Compare the directory contents if the listing is authoritative
+  final DirListingMetadata msDirListing =
+  metadataStore.listChildren(currentDirPath);
+  if (msDirListing != null && msDirListing.isAuthoritative()) {
+final ComparePair cP =
+compareAuthDirListing(s3DirListing, msDirListing);
+if (cP.containsViolation()) {
+  comparePairs.add(cP);
+}
+  }
+
+  // Compare directory and contents, but not the listing
+  final