keith-turner commented on code in PR #5348:
URL: https://github.com/apache/accumulo/pull/5348#discussion_r1970687570


##########
server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/SystemConfigCheckRunner.java:
##########
@@ -18,22 +18,230 @@
  */
 package org.apache.accumulo.server.util.checkCommand;
 
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.accumulo.core.Constants;
+import org.apache.accumulo.core.lock.ServiceLock;
+import org.apache.accumulo.core.metadata.AccumuloTable;
+import org.apache.accumulo.core.metadata.TServerInstance;
+import org.apache.accumulo.core.metadata.schema.TabletMetadata;
+import org.apache.accumulo.core.zookeeper.ZooSession;
 import org.apache.accumulo.server.ServerContext;
 import org.apache.accumulo.server.cli.ServerUtilOpts;
+import org.apache.accumulo.server.log.WalStateManager;
 import org.apache.accumulo.server.util.Admin;
 
 public class SystemConfigCheckRunner implements CheckRunner {
   private static final Admin.CheckCommand.Check check = 
Admin.CheckCommand.Check.SYSTEM_CONFIG;
 
+  public enum ServerProcess {
+    MANAGER, GC, TSERVER, COMPACTION_COORDINATOR, COMPACTOR, MONITOR, 
SCAN_SERVER
+  }
+
   @Override
   public Admin.CheckCommand.CheckStatus runCheck(ServerContext context, 
ServerUtilOpts opts,
       boolean fixFiles) throws Exception {
     Admin.CheckCommand.CheckStatus status = Admin.CheckCommand.CheckStatus.OK;
     printRunning();
+
+    log.trace("********** Checking validity of some ZooKeeper nodes 
**********");
+    status = checkZkNodes(context, status);
+
     printCompleted(status);
     return status;
   }
 
+  private static Admin.CheckCommand.CheckStatus checkZkNodes(ServerContext 
context,
+      Admin.CheckCommand.CheckStatus status) throws Exception {
+    status = checkZKLocks(context, status);
+    status = checkZKTableNodes(context, status);
+    status = checkZKWALsMetadata(context, status);
+
+    return status;
+  }
+
+  private static Admin.CheckCommand.CheckStatus checkZKLocks(ServerContext 
context,
+      Admin.CheckCommand.CheckStatus status) throws Exception {
+    final ServerProcess[] serverProcesses = ServerProcess.values();
+    final String zkRoot = context.getZooKeeperRoot();
+    final var zs = context.getZooSession();
+    final var zrw = zs.asReaderWriter();
+    final var compactors = context.instanceOperations().getCompactors();
+    final var sservers = context.instanceOperations().getScanServers();
+
+    log.trace("Checking ZooKeeper locks for Accumulo server processes...");
+
+    // check that essential server processes have a ZK lock failing otherwise
+    // check that nonessential server processes have a ZK lock only if they 
are running. If they are
+    // not running, alerts the user that the process is not running which may 
or may not be expected
+    for (ServerProcess proc : serverProcesses) {
+      log.trace("Looking for {} lock(s)...", proc);
+      switch (proc) {
+        case MANAGER:
+          // essential process
+          status = checkLock(zkRoot + Constants.ZMANAGER_LOCK, proc, true, zs, 
status);
+          break;
+        case GC:
+          // essential process
+          status = checkLock(zkRoot + Constants.ZGC_LOCK, proc, true, zs, 
status);
+          break;
+        case TSERVER:
+          // essential process(es)
+          final var tservers = TabletMetadata.getLiveTServers(context);
+          if (tservers.isEmpty()) {
+            log.warn("Did not find any running tablet servers!");
+            status = Admin.CheckCommand.CheckStatus.FAILED;
+          }
+          break;
+        case COMPACTION_COORDINATOR:
+          // nonessential process
+          status = checkLock(zkRoot + Constants.ZCOORDINATOR_LOCK, proc, 
false, zs, status);
+          break;
+        case COMPACTOR:
+          // nonessential process(es)
+          if (compactors.isEmpty()) {
+            log.debug("No compactors appear to be running... This may or may 
not be expected");
+          }
+          for (String compactor : compactors) {
+            // for each running compactor, ensure a zk lock exists for it
+            boolean checkedLock = false;
+            String compactorQueuesPath = zkRoot + Constants.ZCOMPACTORS;
+            var compactorQueues = zrw.getChildren(compactorQueuesPath);
+            // find the queue the compactor is in
+            for (var queue : compactorQueues) {
+              String compactorQueuePath = compactorQueuesPath + "/" + queue;
+              String lockPath = compactorQueuePath + "/" + compactor;
+              if (zrw.exists(lockPath)) {
+                status = checkLock(lockPath, proc, true, zs, status);
+                checkedLock = true;
+                break;
+              }
+            }
+            if (!checkedLock) {
+              log.warn("Did not find a ZooKeeper lock for the compactor {}!", 
compactor);
+              status = Admin.CheckCommand.CheckStatus.FAILED;
+            }
+          }
+          break;
+        case MONITOR:
+          // nonessential process
+          status = checkLock(zkRoot + Constants.ZMONITOR_LOCK, proc, false, 
zs, status);
+          break;
+        case SCAN_SERVER:
+          // nonessential process(es)
+          if (sservers.isEmpty()) {
+            log.debug("No scan servers appear to be running... This may or may 
not be expected");
+          }
+          for (String sserver : sservers) {
+            status =
+                checkLock(zkRoot + Constants.ZSSERVERS + "/" + sserver, proc, 
true, zs, status);
+          }
+          break;
+        default:
+          throw new IllegalStateException("Unhandled case: " + proc);
+      }
+    }

Review Comment:
   Yeah in 4.0 compactors are essential, that is the only way compaction can 
happen in 4.0.  Scan servers are still optional in 4.0.  The compaction 
coordinator is merged into the manager in 4.0.



##########
server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/ServerConfigCheckRunner.java:
##########
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.accumulo.server.util.checkCommand;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.accumulo.core.conf.Property;
+import org.apache.accumulo.server.ServerContext;
+import org.apache.accumulo.server.cli.ServerUtilOpts;
+import org.apache.accumulo.server.util.Admin;
+
+public class ServerConfigCheckRunner implements CheckRunner {
+  private static final Admin.CheckCommand.Check check = 
Admin.CheckCommand.Check.SERVER_CONFIG;
+
+  @Override
+  public Admin.CheckCommand.CheckStatus runCheck(ServerContext context, 
ServerUtilOpts opts,
+      boolean fixFiles) throws Exception {
+    Admin.CheckCommand.CheckStatus status = Admin.CheckCommand.CheckStatus.OK;
+    printRunning();
+
+    log.trace("********** Checking server configuration **********");
+
+    log.trace("Checking that all configured properties are valid (valid key 
and value)");
+    final Map<String,String> definedProps = new HashMap<>();
+    final var config = context.getConfiguration();
+    config.getProperties(definedProps, s -> true);
+    for (var entry : definedProps.entrySet()) {
+      var key = entry.getKey();
+      var val = entry.getValue();
+      if (!Property.isValidProperty(key, val)) {
+        log.warn("Invalid property (key={} val={}) found in the config", key, 
val);
+        status = Admin.CheckCommand.CheckStatus.FAILED;
+      }
+    }
+
+    log.trace("Checking that all required config properties are present");
+    // there are many properties that should be set (default value or user 
set), identifying them
+    // all and checking them here is unrealistic. Some property that is not 
set but is expected
+    // will likely result in some sort of failure eventually anyway. We will 
just check a few
+    // obvious required properties here.
+    Set<Property> requiredProps = Set.of(Property.INSTANCE_ZK_HOST, 
Property.INSTANCE_ZK_TIMEOUT,
+        Property.INSTANCE_SECRET, Property.INSTANCE_VOLUMES, 
Property.GENERAL_THREADPOOL_SIZE,
+        Property.GENERAL_DELEGATION_TOKEN_LIFETIME,
+        Property.GENERAL_DELEGATION_TOKEN_UPDATE_INTERVAL, 
Property.GENERAL_IDLE_PROCESS_INTERVAL,
+        Property.GENERAL_LOW_MEM_DETECTOR_INTERVAL, 
Property.GENERAL_LOW_MEM_DETECTOR_THRESHOLD,
+        Property.GENERAL_PROCESS_BIND_ADDRESS, 
Property.GENERAL_SERVER_LOCK_VERIFICATION_INTERVAL,
+        Property.MANAGER_CLIENTPORT, Property.TSERV_CLIENTPORT, 
Property.GC_CYCLE_START,
+        Property.GC_CYCLE_DELAY, Property.GC_PORT, Property.MONITOR_PORT, 
Property.TABLE_MAJC_RATIO,
+        Property.TABLE_SPLIT_THRESHOLD);

Review Comment:
   Not a change for this PR, but wondering if this should be pushed to the 
validation code of each property. For example could we attempt to do something 
like the following in addition to the code that goes through the defined props 
above.  Thinking if a props validation fails on null or empty string then its 
"required" and should be set.  Looking at some of the important props, like 
`instance.volumes` their types would need change from something besides STRING 
that is more specific, which would be a good general change to make (would be 
good to have specific type to do validation for instance volumes and that could 
include not accepting empty string).
   
   ```java
       for(var prop : Property.values()) {
         var value = config.get(prop);
         if (!Property.isValidProperty(prop.getKey(), value)) {
           log.warn("Invalid property (key={} val={}) found in the config", 
prop, value);
         }
       }
   ```
   
   If the rest of the code worked like this, then would not need this list here.



##########
server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/RootTableCheckRunner.java:
##########
@@ -51,8 +51,7 @@ public TableId tableId() {
   public Set<ColumnFQ> requiredColFQs() {
     return 
Set.of(MetadataSchema.TabletsSection.TabletColumnFamily.PREV_ROW_COLUMN,
         MetadataSchema.TabletsSection.ServerColumnFamily.DIRECTORY_COLUMN,
-        MetadataSchema.TabletsSection.ServerColumnFamily.TIME_COLUMN,
-        MetadataSchema.TabletsSection.ServerColumnFamily.LOCK_COLUMN);
+        MetadataSchema.TabletsSection.ServerColumnFamily.TIME_COLUMN);
   }

Review Comment:
   Yeah those look good.



##########
server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/SystemConfigCheckRunner.java:
##########
@@ -18,22 +18,230 @@
  */
 package org.apache.accumulo.server.util.checkCommand;
 
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.accumulo.core.Constants;
+import org.apache.accumulo.core.lock.ServiceLock;
+import org.apache.accumulo.core.metadata.AccumuloTable;
+import org.apache.accumulo.core.metadata.TServerInstance;
+import org.apache.accumulo.core.metadata.schema.TabletMetadata;
+import org.apache.accumulo.core.zookeeper.ZooSession;
 import org.apache.accumulo.server.ServerContext;
 import org.apache.accumulo.server.cli.ServerUtilOpts;
+import org.apache.accumulo.server.log.WalStateManager;
 import org.apache.accumulo.server.util.Admin;
 
 public class SystemConfigCheckRunner implements CheckRunner {
   private static final Admin.CheckCommand.Check check = 
Admin.CheckCommand.Check.SYSTEM_CONFIG;
 
+  public enum ServerProcess {
+    MANAGER, GC, TSERVER, COMPACTION_COORDINATOR, COMPACTOR, MONITOR, 
SCAN_SERVER
+  }

Review Comment:
   In 4.0 have this type
   
   
https://github.com/apache/accumulo/blob/121758a40e35df5589e53d9cfecdd95f7705cc56/core/src/main/java/org/apache/accumulo/core/client/admin/servers/ServerId.java#L39



##########
server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/ServerConfigCheckRunner.java:
##########
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.accumulo.server.util.checkCommand;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.accumulo.core.conf.Property;
+import org.apache.accumulo.server.ServerContext;
+import org.apache.accumulo.server.cli.ServerUtilOpts;
+import org.apache.accumulo.server.util.Admin;
+
+public class ServerConfigCheckRunner implements CheckRunner {
+  private static final Admin.CheckCommand.Check check = 
Admin.CheckCommand.Check.SERVER_CONFIG;
+
+  @Override
+  public Admin.CheckCommand.CheckStatus runCheck(ServerContext context, 
ServerUtilOpts opts,
+      boolean fixFiles) throws Exception {
+    Admin.CheckCommand.CheckStatus status = Admin.CheckCommand.CheckStatus.OK;
+    printRunning();
+
+    log.trace("********** Checking server configuration **********");
+
+    log.trace("Checking that all configured properties are valid (valid key 
and value)");
+    final Map<String,String> definedProps = new HashMap<>();
+    final var config = context.getConfiguration();
+    config.getProperties(definedProps, s -> true);
+    for (var entry : definedProps.entrySet()) {
+      var key = entry.getKey();
+      var val = entry.getValue();
+      if (!Property.isValidProperty(key, val)) {
+        log.warn("Invalid property (key={} val={}) found in the config", key, 
val);
+        status = Admin.CheckCommand.CheckStatus.FAILED;
+      }
+    }
+
+    log.trace("Checking that all required config properties are present");
+    // there are many properties that should be set (default value or user 
set), identifying them
+    // all and checking them here is unrealistic. Some property that is not 
set but is expected
+    // will likely result in some sort of failure eventually anyway. We will 
just check a few
+    // obvious required properties here.
+    Set<Property> requiredProps = Set.of(Property.INSTANCE_ZK_HOST, 
Property.INSTANCE_ZK_TIMEOUT,
+        Property.INSTANCE_SECRET, Property.INSTANCE_VOLUMES, 
Property.GENERAL_THREADPOOL_SIZE,
+        Property.GENERAL_DELEGATION_TOKEN_LIFETIME,
+        Property.GENERAL_DELEGATION_TOKEN_UPDATE_INTERVAL, 
Property.GENERAL_IDLE_PROCESS_INTERVAL,
+        Property.GENERAL_LOW_MEM_DETECTOR_INTERVAL, 
Property.GENERAL_LOW_MEM_DETECTOR_THRESHOLD,
+        Property.GENERAL_PROCESS_BIND_ADDRESS, 
Property.GENERAL_SERVER_LOCK_VERIFICATION_INTERVAL,
+        Property.MANAGER_CLIENTPORT, Property.TSERV_CLIENTPORT, 
Property.GC_CYCLE_START,
+        Property.GC_CYCLE_DELAY, Property.GC_PORT, Property.MONITOR_PORT, 
Property.TABLE_MAJC_RATIO,
+        Property.TABLE_SPLIT_THRESHOLD);

Review Comment:
    As the code is [currently 
written](https://github.com/apache/accumulo/blob/9e54d62244965d56994d4c74a6005aa0d7027411/core/src/main/java/org/apache/accumulo/core/conf/SiteConfiguration.java#L238-L248)
 it loops over all the props the loop in the prev comment may not work well 
because the get method replaces w/ the default value when not present.
   
   In general it seems like it would be best to move the concept of a required 
property into the Property class in some form.  Then the entire system could 
react appropriately when a required property is not present and is requested.  
For now a list in this class seems fine.
   
   I experimented w/ validating the volume prop in #5365  based on the 
exploration done as part of this.
   



##########
server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/SystemConfigCheckRunner.java:
##########
@@ -18,22 +18,230 @@
  */
 package org.apache.accumulo.server.util.checkCommand;
 
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.accumulo.core.Constants;
+import org.apache.accumulo.core.lock.ServiceLock;
+import org.apache.accumulo.core.metadata.AccumuloTable;
+import org.apache.accumulo.core.metadata.TServerInstance;
+import org.apache.accumulo.core.metadata.schema.TabletMetadata;
+import org.apache.accumulo.core.zookeeper.ZooSession;
 import org.apache.accumulo.server.ServerContext;
 import org.apache.accumulo.server.cli.ServerUtilOpts;
+import org.apache.accumulo.server.log.WalStateManager;
 import org.apache.accumulo.server.util.Admin;
 
 public class SystemConfigCheckRunner implements CheckRunner {
   private static final Admin.CheckCommand.Check check = 
Admin.CheckCommand.Check.SYSTEM_CONFIG;
 
+  public enum ServerProcess {
+    MANAGER, GC, TSERVER, COMPACTION_COORDINATOR, COMPACTOR, MONITOR, 
SCAN_SERVER
+  }
+
   @Override
   public Admin.CheckCommand.CheckStatus runCheck(ServerContext context, 
ServerUtilOpts opts,
       boolean fixFiles) throws Exception {
     Admin.CheckCommand.CheckStatus status = Admin.CheckCommand.CheckStatus.OK;
     printRunning();
+
+    log.trace("********** Checking validity of some ZooKeeper nodes 
**********");
+    status = checkZkNodes(context, status);
+
     printCompleted(status);
     return status;
   }
 
+  private static Admin.CheckCommand.CheckStatus checkZkNodes(ServerContext 
context,
+      Admin.CheckCommand.CheckStatus status) throws Exception {
+    status = checkZKLocks(context, status);
+    status = checkZKTableNodes(context, status);
+    status = checkZKWALsMetadata(context, status);
+
+    return status;
+  }
+
+  private static Admin.CheckCommand.CheckStatus checkZKLocks(ServerContext 
context,
+      Admin.CheckCommand.CheckStatus status) throws Exception {
+    final ServerProcess[] serverProcesses = ServerProcess.values();
+    final String zkRoot = context.getZooKeeperRoot();
+    final var zs = context.getZooSession();
+    final var zrw = zs.asReaderWriter();
+    final var compactors = context.instanceOperations().getCompactors();
+    final var sservers = context.instanceOperations().getScanServers();
+
+    log.trace("Checking ZooKeeper locks for Accumulo server processes...");
+
+    // check that essential server processes have a ZK lock failing otherwise
+    // check that nonessential server processes have a ZK lock only if they 
are running. If they are
+    // not running, alerts the user that the process is not running which may 
or may not be expected
+    for (ServerProcess proc : serverProcesses) {
+      log.trace("Looking for {} lock(s)...", proc);
+      switch (proc) {
+        case MANAGER:
+          // essential process
+          status = checkLock(zkRoot + Constants.ZMANAGER_LOCK, proc, true, zs, 
status);
+          break;
+        case GC:
+          // essential process
+          status = checkLock(zkRoot + Constants.ZGC_LOCK, proc, true, zs, 
status);
+          break;
+        case TSERVER:
+          // essential process(es)
+          final var tservers = TabletMetadata.getLiveTServers(context);
+          if (tservers.isEmpty()) {
+            log.warn("Did not find any running tablet servers!");
+            status = Admin.CheckCommand.CheckStatus.FAILED;
+          }
+          break;
+        case COMPACTION_COORDINATOR:
+          // nonessential process
+          status = checkLock(zkRoot + Constants.ZCOORDINATOR_LOCK, proc, 
false, zs, status);
+          break;
+        case COMPACTOR:
+          // nonessential process(es)
+          if (compactors.isEmpty()) {
+            log.debug("No compactors appear to be running... This may or may 
not be expected");
+          }
+          for (String compactor : compactors) {
+            // for each running compactor, ensure a zk lock exists for it
+            boolean checkedLock = false;
+            String compactorQueuesPath = zkRoot + Constants.ZCOMPACTORS;
+            var compactorQueues = zrw.getChildren(compactorQueuesPath);
+            // find the queue the compactor is in
+            for (var queue : compactorQueues) {
+              String compactorQueuePath = compactorQueuesPath + "/" + queue;
+              String lockPath = compactorQueuePath + "/" + compactor;
+              if (zrw.exists(lockPath)) {
+                status = checkLock(lockPath, proc, true, zs, status);
+                checkedLock = true;
+                break;
+              }
+            }
+            if (!checkedLock) {
+              log.warn("Did not find a ZooKeeper lock for the compactor {}!", 
compactor);
+              status = Admin.CheckCommand.CheckStatus.FAILED;
+            }
+          }
+          break;
+        case MONITOR:
+          // nonessential process
+          status = checkLock(zkRoot + Constants.ZMONITOR_LOCK, proc, false, 
zs, status);
+          break;
+        case SCAN_SERVER:
+          // nonessential process(es)
+          if (sservers.isEmpty()) {
+            log.debug("No scan servers appear to be running... This may or may 
not be expected");
+          }
+          for (String sserver : sservers) {
+            status =
+                checkLock(zkRoot + Constants.ZSSERVERS + "/" + sserver, proc, 
true, zs, status);
+          }
+          break;
+        default:
+          throw new IllegalStateException("Unhandled case: " + proc);
+      }
+    }
+
+    return status;
+  }
+
+  private static Admin.CheckCommand.CheckStatus checkLock(String path, 
ServerProcess proc,
+      boolean requiredProc, ZooSession zs, Admin.CheckCommand.CheckStatus 
status) throws Exception {
+    log.trace("Checking ZooKeeper lock at path {}", path);
+
+    ServiceLock.ServiceLockPath slp = ServiceLock.path(path);
+    var opData = ServiceLock.getLockData(zs, slp);
+    if (requiredProc && opData.isEmpty()) {
+      log.warn("No ZooKeeper lock found for {} at {}! The process may not be 
running.", proc, path);
+      status = Admin.CheckCommand.CheckStatus.FAILED;
+    } else if (!requiredProc && opData.isEmpty()) {
+      log.debug("No ZooKeeper lock found for {} at {}. The process may not be 
running. "
+          + "This may or may not be expected.", proc, path);
+    }
+    return status;
+  }
+
+  private static Admin.CheckCommand.CheckStatus 
checkZKTableNodes(ServerContext context,
+      Admin.CheckCommand.CheckStatus status) throws Exception {
+    log.trace("Checking ZooKeeper table nodes...");
+
+    final var zrw = context.getZooSession().asReaderWriter();
+    final var zkRoot = context.getZooKeeperRoot();
+    final var tableNameToId = context.tableOperations().tableIdMap();
+    final Map<String,String> systemTableNameToId = new HashMap<>();
+    for (var accumuloTable : AccumuloTable.values()) {
+      systemTableNameToId.put(accumuloTable.tableName(), 
accumuloTable.tableId().canonical());
+    }
+
+    // ensure all system tables exist
+    if (!tableNameToId.values().containsAll(systemTableNameToId.values())) {
+      log.warn(
+          "Missing essential Accumulo table. One or more of {} are missing 
from the tables found {}",
+          systemTableNameToId, tableNameToId);
+      status = Admin.CheckCommand.CheckStatus.FAILED;
+    }
+    for (var nameToId : tableNameToId.entrySet()) {
+      var tablePath = zkRoot + Constants.ZTABLES + "/" + nameToId.getValue();
+      // expect the table path to exist and some data to exist
+      if (!zrw.exists(tablePath) || zrw.getChildren(tablePath).isEmpty()) {
+        log.warn("Failed to find table ({}) info at expected path {}", 
nameToId, tablePath);
+        status = Admin.CheckCommand.CheckStatus.FAILED;
+      }
+    }
+
+    return status;
+  }
+
+  private static Admin.CheckCommand.CheckStatus 
checkZKWALsMetadata(ServerContext context,
+      Admin.CheckCommand.CheckStatus status) throws Exception {
+    final String zkRoot = context.getZooKeeperRoot();
+    final var zs = context.getZooSession();
+    final var zrw = zs.asReaderWriter();
+    final var rootWalsDir = zkRoot + WalStateManager.ZWALS;
+    final Set<TServerInstance> tserverInstances = 
TabletMetadata.getLiveTServers(context);
+    final Set<TServerInstance> seenTServerInstancesAtWals = new HashSet<>();
+
+    log.trace("Checking that WAL metadata in ZooKeeper is valid...");
+
+    // each child node of the root wals dir should be a 
TServerInstance.toString()
+    var tserverInstancesAtWals = zrw.getChildren(rootWalsDir);
+    for (var tserverInstanceAtWals : tserverInstancesAtWals) {
+      final TServerInstance tsi = new TServerInstance(tserverInstanceAtWals);
+      seenTServerInstancesAtWals.add(tsi);
+      final var tserverPath = rootWalsDir + "/" + tserverInstanceAtWals;
+      // each child node of the tserver should be WAL metadata
+      final var wals = zrw.getChildren(tserverPath);
+      if (wals.isEmpty()) {
+        log.debug("No WAL metadata found for tserver {}", tsi);
+      }
+      for (var wal : wals) {
+        // should be able to parse the WAL metadata
+        final var fullWalPath = tserverPath + "/" + wal;
+        log.trace("Attempting to parse WAL metadata at {}", fullWalPath);
+        var parseRes = WalStateManager.parse(zrw.getData(fullWalPath));
+        log.trace("Successfully parsed WAL metadata at {} result {}", 
fullWalPath, parseRes);
+        log.trace("Checking if the WAL path {} found in the metadata exists in 
HDFS...",
+            parseRes.getSecond());
+        if (!context.getVolumeManager().exists(parseRes.getSecond())) {
+          log.warn("WAL metadata for tserver {} references a WAL that does not 
exist",
+              tserverInstanceAtWals);
+          status = Admin.CheckCommand.CheckStatus.FAILED;
+        }
+      }
+    }
+    if (!tserverInstances.equals(seenTServerInstancesAtWals)) {
+      log.warn(
+          "Expected WAL metadata in ZooKeeper for all tservers. tservers={} 
tservers seen storing WAL metadata={}",
+          tserverInstances, seenTServerInstancesAtWals);
+      status = Admin.CheckCommand.CheckStatus.FAILED;
+    }

Review Comment:
   Would expect tablet servers to always have a WAL, even if nothing is 
written.  There is a time period during tserver startup where it has not yet 
created its first WAL after it has create its lock in ZK.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to