kevinrr888 commented on code in PR #5348:
URL: https://github.com/apache/accumulo/pull/5348#discussion_r2737883751
##########
server/base/src/main/java/org/apache/accumulo/server/util/checkCommand/SystemConfigCheckRunner.java:
##########
@@ -30,10 +41,167 @@ public Admin.CheckCommand.CheckStatus
runCheck(ServerContext context, ServerUtil
boolean fixFiles) throws Exception {
Admin.CheckCommand.CheckStatus status = Admin.CheckCommand.CheckStatus.OK;
printRunning();
+
+ log.trace("********** Checking validity of some ZooKeeper nodes
**********");
+ status = checkZkNodes(context, status);
+
printCompleted(status);
return status;
}
+ private static Admin.CheckCommand.CheckStatus checkZkNodes(ServerContext
context,
+ Admin.CheckCommand.CheckStatus status) throws Exception {
+ status = checkZKLocks(context, status);
+ status = checkZKTableNodes(context, status);
+ status = checkZKWALsMetadata(context, status);
+
+ return status;
+ }
+
+ private static Admin.CheckCommand.CheckStatus checkZKLocks(ServerContext
context,
+ Admin.CheckCommand.CheckStatus status) throws Exception {
+ final ServerId.Type[] serverTypes = ServerId.Type.values();
+
+ log.trace("Checking ZooKeeper locks for Accumulo server processes...");
+
+ // check that essential server processes have a ZK lock failing otherwise
+ // check that nonessential server processes have a ZK lock only if they
are running. If they are
+ // not running, alerts the user that the process is not running which may
or may not be expected
+ for (ServerId.Type serverType : serverTypes) {
+ log.trace("Looking for {} lock(s)...", serverType);
+ var servers = context.instanceOperations().getServers(serverType);
+
+ switch (serverType) {
+ case MANAGER:
+ // essential process
+ case GARBAGE_COLLECTOR:
+ // essential process
+ if (servers.size() != 1) {
+ log.warn("Expected 1 server to be found for {} but found {}",
serverType,
+ servers.size());
+ status = Admin.CheckCommand.CheckStatus.FAILED;
+ } else {
+ // no exception and 1 server found
+ log.trace("Verified ZooKeeper lock for {}", servers);
+ }
+ break;
+ case MONITOR:
+ // nonessential process
+ if (servers.isEmpty()) {
+ log.debug("No {} appears to be running. This may or may not be
expected", serverType);
+ } else if (servers.size() > 1) {
+ log.warn("More than 1 {} was found running. This is not expected",
serverType);
+ status = Admin.CheckCommand.CheckStatus.FAILED;
+ } else {
+ // no exception and 1 server found
+ log.trace("Verified ZooKeeper lock for {}", servers);
+ }
+ break;
+ case TABLET_SERVER:
+ // essential process(es)
+ case COMPACTOR:
+ // essential process(es)
+ if (servers.isEmpty()) {
+ log.warn("No {} appear to be running. This is not expected.",
serverType);
+ status = Admin.CheckCommand.CheckStatus.FAILED;
+ } else {
+ // no exception and >= 1 server found
+ log.trace("Verified ZooKeeper lock(s) for {}", servers);
+ }
+ break;
+ case SCAN_SERVER:
+ // nonessential process(es)
+ if (servers.isEmpty()) {
+ log.debug("No {} appear to be running. This may or may not be
expected.", serverType);
+ } else {
+ // no exception and >= 1 server found
+ log.trace("Verified ZooKeeper lock(s) for {}", servers);
+ }
+ break;
+ default:
+ throw new IllegalStateException("Unhandled case: " + serverType);
+ }
+ }
+
+ return status;
+ }
+
+ private static Admin.CheckCommand.CheckStatus
checkZKTableNodes(ServerContext context,
+ Admin.CheckCommand.CheckStatus status) throws Exception {
+ log.trace("Checking ZooKeeper table nodes...");
+
+ final var zrw = context.getZooSession().asReaderWriter();
+ final var tableNameToId = context.tableOperations().tableIdMap();
+ final Map<String,String> systemTableNameToId = new HashMap<>();
+ for (var accumuloTable : SystemTables.values()) {
+ systemTableNameToId.put(accumuloTable.tableName(),
accumuloTable.tableId().canonical());
+ }
+
+ // ensure all system tables exist
+ if (!tableNameToId.values().containsAll(systemTableNameToId.values())) {
+ log.warn(
+ "Missing essential Accumulo table. One or more of {} are missing
from the tables found {}",
+ systemTableNameToId, tableNameToId);
+ status = Admin.CheckCommand.CheckStatus.FAILED;
+ }
+ for (var nameToId : tableNameToId.entrySet()) {
+ var tablePath = Constants.ZTABLES + "/" + nameToId.getValue();
+ // expect the table path to exist and some data to exist
+ if (!zrw.exists(tablePath) || zrw.getChildren(tablePath).isEmpty()) {
+ log.warn("Failed to find table ({}) info at expected path {}",
nameToId, tablePath);
+ status = Admin.CheckCommand.CheckStatus.FAILED;
+ }
+ }
+
+ return status;
+ }
+
+ private static Admin.CheckCommand.CheckStatus
checkZKWALsMetadata(ServerContext context,
+ Admin.CheckCommand.CheckStatus status) throws Exception {
+ final var zs = context.getZooSession();
+ final var zrw = zs.asReaderWriter();
+ final var rootWalsDir = WalStateManager.ZWALS;
+ final Set<TServerInstance> tserverInstances =
TabletMetadata.getLiveTServers(context);
+ final Set<TServerInstance> seenTServerInstancesAtWals = new HashSet<>();
+
+ log.trace("Checking that WAL metadata in ZooKeeper is valid...");
+
+ // each child node of the root wals dir should be a
TServerInstance.toString()
+ var tserverInstancesAtWals = zrw.getChildren(rootWalsDir);
+ for (var tserverInstanceAtWals : tserverInstancesAtWals) {
+ final TServerInstance tsi = new TServerInstance(tserverInstanceAtWals);
+ seenTServerInstancesAtWals.add(tsi);
+ final var tserverPath = rootWalsDir + "/" + tserverInstanceAtWals;
+ // each child node of the tserver should be WAL metadata
+ final var wals = zrw.getChildren(tserverPath);
+ if (wals.isEmpty()) {
+ log.debug("No WAL metadata found for tserver {}", tsi);
Review Comment:
Made this a WARN, added comment, and improved log message in
1af0ad37e051d4f6c770110e711fc0d122cb832f
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: notifications-unsubscribe@accumulo.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org