huaxiangsun commented on a change in pull request #2584: URL: https://github.com/apache/hbase/pull/2584#discussion_r515749525
########## File path: hbase-client/src/main/java/org/apache/hadoop/hbase/client/AsyncNonMetaRegionLocator.java ########## @@ -196,8 +202,43 @@ private boolean tryComplete(LocateRequest req, CompletableFuture<RegionLocations MAX_CONCURRENT_LOCATE_REQUEST_PER_TABLE, DEFAULT_MAX_CONCURRENT_LOCATE_REQUEST_PER_TABLE); this.locatePrefetchLimit = conn.getConfiguration().getInt(LOCATE_PREFETCH_LIMIT, DEFAULT_LOCATE_PREFETCH_LIMIT); - this.useMetaReplicas = - conn.getConfiguration().getBoolean(USE_META_REPLICAS, DEFAULT_USE_META_REPLICAS); + + // Get the region locator's meta replica mode. + this.metaReplicaMode = CatalogReplicaMode.valueOf(conn.getConfiguration() + .get(LOCATOR_META_REPLICAS_MODE, CatalogReplicaMode.None.toString())); + + switch (this.metaReplicaMode) { + case LoadBalance: + String replicaSelectorClass = conn.getConfiguration(). + get(RegionLocator.LOCATOR_META_REPLICAS_MODE_LOADBALANCE_SELECTOR, + CatalogReplicaLoadBalanceSimpleSelector.class.getName()); + + this.metaReplicaSelector = CatalogReplicaLoadBalanceSelectorFactory.createSelector( + replicaSelectorClass, META_TABLE_NAME, conn, () -> { + int numOfReplicas = 1; + try { + RegionLocations metaLocations = conn.registry.getMetaRegionLocations().get( + GET_META_LOCATIONS_TIMEOUT, TimeUnit.MILLISECONDS); + numOfReplicas = metaLocations.size(); + } catch (Exception e) { + LOG.error("Failed to get table {}'s region replication, ", META_TABLE_NAME, e); + } + return numOfReplicas; + }); + break; + case None: + // If user does not configure LOCATOR_META_REPLICAS_MODE, let's check the legacy config. + if (this.metaReplicaMode == CatalogReplicaMode.None) { + boolean useMetaReplicas = conn.getConfiguration().getBoolean(USE_META_REPLICAS, + DEFAULT_USE_META_REPLICAS); + if (useMetaReplicas) { + this.metaReplicaMode = CatalogReplicaMode.HedgedRead; + } + } + break; + default: + // Doing nothing Review comment: There will be no exception as metaReplica is already an enum here. 
The default branch is added only to keep the Hadoop build checks from complaining. ########## File path: hbase-client/src/main/java/org/apache/hadoop/hbase/RegionLocations.java ########## @@ -38,6 +38,9 @@ private final int numNonNullElements; + // For Meta Replica LoadBalance mode, this indicates which meta replica the locations come from. + private int fromMetaReplicaId; Review comment: Ok, going to remove the replica id for now. We can go back to it later when it is needed. ########## File path: hbase-client/src/main/java/org/apache/hadoop/hbase/client/AsyncNonMetaRegionLocator.java ########## @@ -82,14 +84,18 @@ static String LOCATE_PREFETCH_LIMIT = "hbase.client.locate.prefetch.limit"; private static final int DEFAULT_LOCATE_PREFETCH_LIMIT = 10; + private static final long GET_META_LOCATIONS_TIMEOUT = 2000; // 2 seconds Review comment: This is the timeout value used when waiting for the meta locations from the registry. ########## File path: hbase-client/src/main/java/org/apache/hadoop/hbase/client/AsyncConnectionImpl.java ########## @@ -182,8 +183,21 @@ public void newDead(ServerName sn) { } private void spawnRenewalChore(final UserGroupInformation user) { - authService = new ChoreService("Relogin service"); - authService.scheduleChore(AuthUtil.getAuthRenewalChore(user)); + ChoreService service = getChoreService(); + service.scheduleChore(AuthUtil.getAuthRenewalChore(user)); + } + + /** + * If choreService has not been created yet, create the ChoreService. + * @return ChoreService + */ + ChoreService getChoreService() { + synchronized (this) { + if (choreService == null) { + choreService = new ChoreService("AsyncConn Chore Service"); + } + } + return choreService; Review comment: Done with the first approach. ########## File path: hbase-client/src/main/java/org/apache/hadoop/hbase/client/CatalogReplicaLoadBalanceSimpleSelector.java ########## @@ -0,0 +1,302 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.client; + +import static org.apache.hadoop.hbase.client.ConnectionUtils.isEmptyStopRow; +import static org.apache.hadoop.hbase.util.Bytes.BYTES_COMPARATOR; +import static org.apache.hadoop.hbase.util.ConcurrentMapUtils.computeIfAbsent; +import java.util.Iterator; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.ConcurrentNavigableMap; +import java.util.concurrent.ConcurrentSkipListMap; +import java.util.concurrent.ThreadLocalRandom; +import java.util.function.IntSupplier; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.hadoop.hbase.HRegionLocation; +import org.apache.hadoop.hbase.ScheduledChore; +import org.apache.hadoop.hbase.Stoppable; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * <p>CatalogReplicaLoadBalanceReplicaSimpleSelector implements a simple catalog replica load balancing + * algorithm. It maintains a stale location cache for each table. Whenever client looks up location, + * it first check if the row is the stale location cache. 
If yes, the location from + * catalog replica is stale, it will go to the primary region to look up update-to-date location; + * otherwise, it will randomly pick up a replica region for lookup. When clients receive + * RegionNotServedException from region servers, it will add these region locations to the stale + * location cache. The stale cache will be cleaned up periodically by a chore.</p> + * + * It follows a simple algorithm to choose a replica to go: + * + * <ol> + * <li>If there is no stale location entry for rows it looks up, it will randomly + * pick a replica region to do lookup. </li> + * <li>If the location from the replica region is stale, client gets RegionNotServedException + * from region server, in this case, it will create StaleLocationCacheEntry in + * CatalogReplicaLoadBalanceReplicaSimpleSelector.</li> + * <li>When client tries to do location lookup, it checks StaleLocationCache first for rows it + * tries to lookup, if entry exists, it will go with primary meta region to do lookup; + * otherwise, it will follow step 1.</li> + * <li>A chore will periodically run to clean up cache entries in the StaleLocationCache.</li> + * </ol> + */ +class CatalogReplicaLoadBalanceSimpleSelector implements + CatalogReplicaLoadBalanceSelector, Stoppable { + private static final Logger LOG = + LoggerFactory.getLogger(CatalogReplicaLoadBalanceSimpleSelector.class); + private final long STALE_CACHE_TIMEOUT_IN_MILLISECONDS = 3000; // 3 seconds + private final int STALE_CACHE_CLEAN_CHORE_INTERVAL_IN_MILLISECONDS = 1500; // 1.5 seconds + private final int REFRESH_REPLICA_COUNT_CHORE_INTERVAL_IN_MILLISECONDS = 60000; // 1 minute + + /** + * StaleLocationCacheEntry is the entry when a stale location is reported by an client. + */ + private static final class StaleLocationCacheEntry { + // replica id where the stale location comes from. 
+ private final int fromReplicaId; + + // timestamp in milliseconds + private final long timestamp; + + private final byte[] endKey; + + StaleLocationCacheEntry(final int metaReplicaId, final byte[] endKey) { + this.fromReplicaId = metaReplicaId; + this.endKey = endKey; + timestamp = EnvironmentEdgeManager.currentTime(); + } + + public byte[] getEndKey() { + return this.endKey; + } + + public int getFromReplicaId() { + return this.fromReplicaId; + } + public long getTimestamp() { + return this.timestamp; + } + + @Override + public String toString() { + return new ToStringBuilder(this) + .append("endKey", endKey) + .append("fromReplicaId", fromReplicaId) + .append("timestamp", timestamp) + .toString(); + } + } + + private final ConcurrentMap<TableName, ConcurrentNavigableMap<byte[], StaleLocationCacheEntry>> + staleCache = new ConcurrentHashMap<>(); + private volatile int numOfReplicas; + private final AsyncConnectionImpl conn; + private final TableName tableName; + private final IntSupplier getNumOfReplicas; + private volatile boolean isStopped = false; + + CatalogReplicaLoadBalanceSimpleSelector(TableName tableName, AsyncConnectionImpl conn, + IntSupplier getNumOfReplicas) { + this.conn = conn; + this.tableName = tableName; + this.getNumOfReplicas = getNumOfReplicas; + + // This numOfReplicas is going to be lazy initialized. + this.numOfReplicas = -1; + // Start chores + this.conn.getChoreService().scheduleChore(getCacheCleanupChore(this)); + this.conn.getChoreService().scheduleChore(getRefreshReplicaCountChore(this)); + } + + /** + * When a client runs into RegionNotServingException, it will call this method to + * update Selector's internal state. + * @param loc the location which causes exception. + * @param fromReplicaId the replica id where the stale location comes from. 
+ */ + public void onError(HRegionLocation loc, int fromReplicaId) { + ConcurrentNavigableMap<byte[], StaleLocationCacheEntry> tableCache = + computeIfAbsent(staleCache, loc.getRegion().getTable(), + () -> new ConcurrentSkipListMap<>(BYTES_COMPARATOR)); + byte[] startKey = loc.getRegion().getStartKey(); + tableCache.putIfAbsent(startKey, + new StaleLocationCacheEntry(fromReplicaId, loc.getRegion().getEndKey())); + LOG.debug("Add entry to stale cache for table {} with startKey {}, {}", + loc.getRegion().getTable(), startKey, loc.getRegion().getEndKey()); + } + + /** + * Select an random replica id. In case there is no replica region configured, return + * the primary replica id. + * @return Replica id + */ + private int getRandomReplicaId() { + int cachedNumOfReplicas = this.numOfReplicas; + if (cachedNumOfReplicas < 0) { + cachedNumOfReplicas = refreshCatalogReplicaCount(); + this.numOfReplicas = cachedNumOfReplicas; + } + // In case of no replica configured, return the primary region id. + if (cachedNumOfReplicas <= 1) { + return RegionInfo.DEFAULT_REPLICA_ID; + } + return 1 + ThreadLocalRandom.current().nextInt(cachedNumOfReplicas - 1); + } + + /** + * When it looks up a location, it will call this method to find a replica region to go. + * For a normal case, > 99% of region locations from catalog/meta replica will be up to date. + * In extreme cases such as region server crashes, it will depends on how fast replication + * catches up. + * + * @param tablename table name it looks up + * @param row key it looks up. + * @param locateType locateType, Only BEFORE and CURRENT will be passed in. + * @return catalog replica id + */ + public int select(final TableName tablename, final byte[] row, + final RegionLocateType locateType) { + ConcurrentNavigableMap<byte[], StaleLocationCacheEntry> tableCache = staleCache.get(tablename); + + // If there is no entry in StaleCache, select a random replica id. 
+ if (tableCache == null) { + return getRandomReplicaId(); + } + + Map.Entry<byte[], StaleLocationCacheEntry> entry; + boolean isEmptyStopRow = isEmptyStopRow(row); + // Only BEFORE and CURRENT are passed in. + if (locateType == RegionLocateType.BEFORE) { + entry = isEmptyStopRow ? tableCache.lastEntry() : tableCache.lowerEntry(row); + } else { + entry = tableCache.floorEntry(row); + } + + // It is not in the stale cache, return a random replica id. + if (entry == null) { + return getRandomReplicaId(); + } + + // The entry here is a possible match for the location. Check if the entry times out first as + // long comparing is faster than comparing byte arrays(in most cases). It could remove + // stale entries faster. If the possible match entry does not time out, it will check if + // the entry is a match for the row passed in and select the replica id accordingly. + if ((EnvironmentEdgeManager.currentTime() - entry.getValue().getTimestamp()) >= + STALE_CACHE_TIMEOUT_IN_MILLISECONDS) { + LOG.debug("Entry for table {} with startKey {}, {} times out", tablename, entry.getKey(), + entry); + tableCache.remove(entry.getKey()); + return getRandomReplicaId(); + } + + byte[] endKey = entry.getValue().getEndKey(); + + // The following logic is borrowed from AsyncNonMetaRegionLocator. + if (isEmptyStopRow(endKey)) { + LOG.debug("Lookup {} goes to primary region", row); + return RegionInfo.DEFAULT_REPLICA_ID; + } + + if (locateType == RegionLocateType.BEFORE) { + if (!isEmptyStopRow && Bytes.compareTo(endKey, row) >= 0) { + LOG.debug("Lookup {} goes to primary meta", row); + return RegionInfo.DEFAULT_REPLICA_ID; + } + } else { + if (Bytes.compareTo(row, endKey) < 0) { + LOG.debug("Lookup {} goes to primary meta", row); + return RegionInfo.DEFAULT_REPLICA_ID; + } + } + + // Not in stale cache, return a random replica id. + return getRandomReplicaId(); + } + + @Override + public void stop(String why) { Review comment: It is not called currently. 
AsyncConnectionImpl (owning choreService) cancels chores when closing. ########## File path: hbase-client/src/main/java/org/apache/hadoop/hbase/client/AsyncNonMetaRegionLocator.java ########## @@ -577,6 +635,15 @@ private void removeLocationFromCache(HRegionLocation loc) { if (!canUpdateOnError(loc, oldLoc)) { return; } + // Tell metaReplicaSelector that the location is stale. It will create a stale entry + // with timestamp internally. Next time the client looks up the same location, + // it will pick a different meta replica region. For the current implementation, + // the metaReplicaId is not used, so the primary one is passed in. + if (this.metaReplicaMode == CatalogReplicaMode.LoadBalance) { + // metaReplicaId is not used in simpleSelector, default to the primary one. + metaReplicaSelector.onError(loc, RegionInfo.DEFAULT_REPLICA_ID); Review comment: As discussed, removed fromReplicaId for now. Will go back to this part later when it is needed. ########## File path: hbase-client/src/main/java/org/apache/hadoop/hbase/RegionLocations.java ########## @@ -86,6 +91,25 @@ public RegionLocations(HRegionLocation... locations) { this.numNonNullElements = numNonNullElements; } + /** + * Constructs the region location list. The locations array should + * contain all the locations for known replicas for the region, and should be + * sorted in replicaId ascending order, although it can contain nulls indicating replicaIds + * that the locations of which are not known. + * @param locations an array of HRegionLocations for the same region range + */ + public RegionLocations(HRegionLocation... locations) { Review comment: The latest patch undid this change so there is no change in RegionLocations. 
########## File path: hbase-client/src/main/java/org/apache/hadoop/hbase/client/AsyncNonMetaRegionLocator.java ########## @@ -196,8 +202,43 @@ private boolean tryComplete(LocateRequest req, CompletableFuture<RegionLocations MAX_CONCURRENT_LOCATE_REQUEST_PER_TABLE, DEFAULT_MAX_CONCURRENT_LOCATE_REQUEST_PER_TABLE); this.locatePrefetchLimit = conn.getConfiguration().getInt(LOCATE_PREFETCH_LIMIT, DEFAULT_LOCATE_PREFETCH_LIMIT); - this.useMetaReplicas = - conn.getConfiguration().getBoolean(USE_META_REPLICAS, DEFAULT_USE_META_REPLICAS); + + // Get the region locator's meta replica mode. + this.metaReplicaMode = CatalogReplicaMode.valueOf(conn.getConfiguration() + .get(LOCATOR_META_REPLICAS_MODE, CatalogReplicaMode.None.toString())); + + switch (this.metaReplicaMode) { + case LoadBalance: + String replicaSelectorClass = conn.getConfiguration(). + get(RegionLocator.LOCATOR_META_REPLICAS_MODE_LOADBALANCE_SELECTOR, + CatalogReplicaLoadBalanceSimpleSelector.class.getName()); + + this.metaReplicaSelector = CatalogReplicaLoadBalanceSelectorFactory.createSelector( + replicaSelectorClass, META_TABLE_NAME, conn, () -> { + int numOfReplicas = 1; + try { + RegionLocations metaLocations = conn.registry.getMetaRegionLocations().get( + GET_META_LOCATIONS_TIMEOUT, TimeUnit.MILLISECONDS); + numOfReplicas = metaLocations.size(); + } catch (Exception e) { + LOG.error("Failed to get table {}'s region replication, ", META_TABLE_NAME, e); + } + return numOfReplicas; + }); + break; + case None: + // If user does not configure LOCATOR_META_REPLICAS_MODE, let's check the legacy config. + if (this.metaReplicaMode == CatalogReplicaMode.None) { Review comment: Nice catch! 
########## File path: hbase-client/src/main/java/org/apache/hadoop/hbase/client/AsyncNonMetaRegionLocator.java ########## @@ -196,8 +202,43 @@ private boolean tryComplete(LocateRequest req, CompletableFuture<RegionLocations MAX_CONCURRENT_LOCATE_REQUEST_PER_TABLE, DEFAULT_MAX_CONCURRENT_LOCATE_REQUEST_PER_TABLE); this.locatePrefetchLimit = conn.getConfiguration().getInt(LOCATE_PREFETCH_LIMIT, DEFAULT_LOCATE_PREFETCH_LIMIT); - this.useMetaReplicas = - conn.getConfiguration().getBoolean(USE_META_REPLICAS, DEFAULT_USE_META_REPLICAS); + + // Get the region locator's meta replica mode. + this.metaReplicaMode = CatalogReplicaMode.valueOf(conn.getConfiguration() + .get(LOCATOR_META_REPLICAS_MODE, CatalogReplicaMode.None.toString())); + + switch (this.metaReplicaMode) { + case LoadBalance: + String replicaSelectorClass = conn.getConfiguration(). Review comment: The factory is CatalogReplicaLoadBalanceSelectorFactory, which could also be used for a future root table. The configurations for meta and for future system tables are different, so I think it is better to keep all of this table-specific configuration outside of CatalogReplicaLoadBalanceSelectorFactory. ########## File path: hbase-client/src/main/java/org/apache/hadoop/hbase/client/CatalogReplicaLoadBalanceSimpleSelector.java ########## @@ -0,0 +1,293 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.client; + +import static org.apache.hadoop.hbase.client.ConnectionUtils.isEmptyStopRow; +import static org.apache.hadoop.hbase.util.Bytes.BYTES_COMPARATOR; +import static org.apache.hadoop.hbase.util.ConcurrentMapUtils.computeIfAbsent; +import java.util.Iterator; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.ConcurrentNavigableMap; +import java.util.concurrent.ConcurrentSkipListMap; +import java.util.concurrent.ThreadLocalRandom; +import java.util.function.IntSupplier; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.hadoop.hbase.HRegionLocation; +import org.apache.hadoop.hbase.ScheduledChore; +import org.apache.hadoop.hbase.Stoppable; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * <p>CatalogReplicaLoadBalanceReplicaSimpleSelector implements a simple catalog replica load balancing + * algorithm. It maintains a stale location cache for each table. Whenever client looks up location, + * it first check if the row is the stale location cache. If yes, the location from + * catalog replica is stale, it will go to the primary region to look up update-to-date location; + * otherwise, it will randomly pick up a replica region for lookup. 
When clients receive + * RegionNotServedException from region servers, it will add these region locations to the stale + * location cache. The stale cache will be cleaned up periodically by a chore.</p> + * + * It follows a simple algorithm to choose a replica to go: + * + * <ol> + * <li>If there is no stale location entry for rows it looks up, it will randomly + * pick a replica region to do lookup. </li> + * <li>If the location from the replica region is stale, client gets RegionNotServedException + * from region server, in this case, it will create StaleLocationCacheEntry in + * CatalogReplicaLoadBalanceReplicaSimpleSelector.</li> + * <li>When client tries to do location lookup, it checks StaleLocationCache first for rows it + * tries to lookup, if entry exists, it will go with primary meta region to do lookup; + * otherwise, it will follow step 1.</li> + * <li>A chore will periodically run to clean up cache entries in the StaleLocationCache.</li> + * </ol> + */ +class CatalogReplicaLoadBalanceSimpleSelector implements + CatalogReplicaLoadBalanceSelector, Stoppable { + private static final Logger LOG = + LoggerFactory.getLogger(CatalogReplicaLoadBalanceSimpleSelector.class); + private final long STALE_CACHE_TIMEOUT_IN_MILLISECONDS = 3000; // 3 seconds + private final int STALE_CACHE_CLEAN_CHORE_INTERVAL_IN_MILLISECONDS = 1500; // 1.5 seconds + private final int REFRESH_REPLICA_COUNT_CHORE_INTERVAL_IN_MILLISECONDS = 60000; // 1 minute + + /** + * StaleLocationCacheEntry is the entry when a stale location is reported by an client. 
+ */ + private static final class StaleLocationCacheEntry { + // timestamp in milliseconds + private final long timestamp; + + private final byte[] endKey; + + StaleLocationCacheEntry(final byte[] endKey) { + this.endKey = endKey; + timestamp = EnvironmentEdgeManager.currentTime(); + } + + public byte[] getEndKey() { + return this.endKey; + } + + public long getTimestamp() { + return this.timestamp; + } + + @Override + public String toString() { + return new ToStringBuilder(this) + .append("endKey", endKey) + .append("timestamp", timestamp) + .toString(); + } + } + + private final ConcurrentMap<TableName, ConcurrentNavigableMap<byte[], StaleLocationCacheEntry>> + staleCache = new ConcurrentHashMap<>(); + private volatile int numOfReplicas; + private final AsyncConnectionImpl conn; + private final TableName tableName; + private final IntSupplier getNumOfReplicas; + private volatile boolean isStopped = false; + + CatalogReplicaLoadBalanceSimpleSelector(TableName tableName, AsyncConnectionImpl conn, + IntSupplier getNumOfReplicas) { + this.conn = conn; + this.tableName = tableName; + this.getNumOfReplicas = getNumOfReplicas; + + // This numOfReplicas is going to be lazy initialized. + this.numOfReplicas = -1; + // Start chores + this.conn.getChoreService().scheduleChore(getCacheCleanupChore(this)); + this.conn.getChoreService().scheduleChore(getRefreshReplicaCountChore(this)); + } + + /** + * When a client runs into RegionNotServingException, it will call this method to + * update Selector's internal state. + * @param loc the location which causes exception. 
+ */ + public void onError(HRegionLocation loc) { + ConcurrentNavigableMap<byte[], StaleLocationCacheEntry> tableCache = + computeIfAbsent(staleCache, loc.getRegion().getTable(), + () -> new ConcurrentSkipListMap<>(BYTES_COMPARATOR)); + byte[] startKey = loc.getRegion().getStartKey(); + tableCache.putIfAbsent(startKey, + new StaleLocationCacheEntry(loc.getRegion().getEndKey())); + LOG.debug("Add entry to stale cache for table {} with startKey {}, {}", + loc.getRegion().getTable(), startKey, loc.getRegion().getEndKey()); + } + + /** + * Select an random replica id. In case there is no replica region configured, return + * the primary replica id. + * @return Replica id + */ + private int getRandomReplicaId() { + int cachedNumOfReplicas = this.numOfReplicas; + if (cachedNumOfReplicas < 0) { + cachedNumOfReplicas = refreshCatalogReplicaCount(); + this.numOfReplicas = cachedNumOfReplicas; + } + // In case of no replica configured, return the primary region id. + if (cachedNumOfReplicas <= 1) { + return RegionInfo.DEFAULT_REPLICA_ID; + } + return 1 + ThreadLocalRandom.current().nextInt(cachedNumOfReplicas - 1); + } + + /** + * When it looks up a location, it will call this method to find a replica region to go. + * For a normal case, > 99% of region locations from catalog/meta replica will be up to date. + * In extreme cases such as region server crashes, it will depends on how fast replication + * catches up. + * + * @param tablename table name it looks up + * @param row key it looks up. + * @param locateType locateType, Only BEFORE and CURRENT will be passed in. + * @return catalog replica id + */ + public int select(final TableName tablename, final byte[] row, + final RegionLocateType locateType) { + ConcurrentNavigableMap<byte[], StaleLocationCacheEntry> tableCache = staleCache.get(tablename); + + // If there is no entry in StaleCache, select a random replica id. 
+ if (tableCache == null) { + return getRandomReplicaId(); + } + + Map.Entry<byte[], StaleLocationCacheEntry> entry; + boolean isEmptyStopRow = isEmptyStopRow(row); + // Only BEFORE and CURRENT are passed in. Review comment: Sorry, I missed this before. I just checked: Preconditions.checkArgument() already throws IllegalArgumentException, so the "should not happen" branch is not needed? ########## File path: hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestAsyncNonMetaRegionLocator.java ########## @@ -55,35 +58,53 @@ import org.junit.ClassRule; import org.junit.Test; import org.junit.experimental.categories.Category; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; @Category({ MediumTests.class, ClientTests.class }) +@RunWith(Parameterized.class) Review comment: I skimmed through MiniClusterRule; it seems that I still need to keep @BeforeClass to do a set of things. I would like to use it in the new tests, and keep the existing test case as it is for now. ########## File path: hbase-client/src/main/java/org/apache/hadoop/hbase/client/CatalogReplicaLoadBalanceSimpleSelector.java ########## @@ -0,0 +1,302 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.client; + +import static org.apache.hadoop.hbase.client.ConnectionUtils.isEmptyStopRow; +import static org.apache.hadoop.hbase.util.Bytes.BYTES_COMPARATOR; +import static org.apache.hadoop.hbase.util.ConcurrentMapUtils.computeIfAbsent; +import java.util.Iterator; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.ConcurrentNavigableMap; +import java.util.concurrent.ConcurrentSkipListMap; +import java.util.concurrent.ThreadLocalRandom; +import java.util.function.IntSupplier; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.hadoop.hbase.HRegionLocation; +import org.apache.hadoop.hbase.ScheduledChore; +import org.apache.hadoop.hbase.Stoppable; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * <p>CatalogReplicaLoadBalanceReplicaSimpleSelector implements a simple catalog replica load balancing + * algorithm. It maintains a stale location cache for each table. Whenever client looks up location, + * it first check if the row is the stale location cache. If yes, the location from + * catalog replica is stale, it will go to the primary region to look up update-to-date location; + * otherwise, it will randomly pick up a replica region for lookup. When clients receive + * RegionNotServedException from region servers, it will add these region locations to the stale + * location cache. The stale cache will be cleaned up periodically by a chore.</p> + * + * It follows a simple algorithm to choose a replica to go: + * + * <ol> + * <li>If there is no stale location entry for rows it looks up, it will randomly + * pick a replica region to do lookup. 
</li> + * <li>If the location from the replica region is stale, client gets RegionNotServedException + * from region server, in this case, it will create StaleLocationCacheEntry in + * CatalogReplicaLoadBalanceReplicaSimpleSelector.</li> + * <li>When client tries to do location lookup, it checks StaleLocationCache first for rows it + * tries to lookup, if entry exists, it will go with primary meta region to do lookup; + * otherwise, it will follow step 1.</li> + * <li>A chore will periodically run to clean up cache entries in the StaleLocationCache.</li> + * </ol> + */ +class CatalogReplicaLoadBalanceSimpleSelector implements + CatalogReplicaLoadBalanceSelector, Stoppable { + private static final Logger LOG = + LoggerFactory.getLogger(CatalogReplicaLoadBalanceSimpleSelector.class); + private final long STALE_CACHE_TIMEOUT_IN_MILLISECONDS = 3000; // 3 seconds + private final int STALE_CACHE_CLEAN_CHORE_INTERVAL_IN_MILLISECONDS = 1500; // 1.5 seconds + private final int REFRESH_REPLICA_COUNT_CHORE_INTERVAL_IN_MILLISECONDS = 60000; // 1 minute + + /** + * StaleLocationCacheEntry is the entry when a stale location is reported by an client. + */ + private static final class StaleLocationCacheEntry { + // replica id where the stale location comes from. 
+ private final int fromReplicaId; + + // timestamp in milliseconds + private final long timestamp; + + private final byte[] endKey; + + StaleLocationCacheEntry(final int metaReplicaId, final byte[] endKey) { + this.fromReplicaId = metaReplicaId; + this.endKey = endKey; + timestamp = EnvironmentEdgeManager.currentTime(); + } + + public byte[] getEndKey() { + return this.endKey; + } + + public int getFromReplicaId() { + return this.fromReplicaId; + } + public long getTimestamp() { + return this.timestamp; + } + + @Override + public String toString() { + return new ToStringBuilder(this) + .append("endKey", endKey) + .append("fromReplicaId", fromReplicaId) + .append("timestamp", timestamp) + .toString(); + } + } + + private final ConcurrentMap<TableName, ConcurrentNavigableMap<byte[], StaleLocationCacheEntry>> + staleCache = new ConcurrentHashMap<>(); + private volatile int numOfReplicas; + private final AsyncConnectionImpl conn; + private final TableName tableName; + private final IntSupplier getNumOfReplicas; + private volatile boolean isStopped = false; + + CatalogReplicaLoadBalanceSimpleSelector(TableName tableName, AsyncConnectionImpl conn, + IntSupplier getNumOfReplicas) { + this.conn = conn; + this.tableName = tableName; + this.getNumOfReplicas = getNumOfReplicas; + + // This numOfReplicas is going to be lazy initialized. + this.numOfReplicas = -1; + // Start chores + this.conn.getChoreService().scheduleChore(getCacheCleanupChore(this)); + this.conn.getChoreService().scheduleChore(getRefreshReplicaCountChore(this)); + } + + /** + * When a client runs into RegionNotServingException, it will call this method to + * update Selector's internal state. + * @param loc the location which causes exception. + * @param fromReplicaId the replica id where the stale location comes from. 
+ */ + public void onError(HRegionLocation loc, int fromReplicaId) { + ConcurrentNavigableMap<byte[], StaleLocationCacheEntry> tableCache = + computeIfAbsent(staleCache, loc.getRegion().getTable(), + () -> new ConcurrentSkipListMap<>(BYTES_COMPARATOR)); + byte[] startKey = loc.getRegion().getStartKey(); + tableCache.putIfAbsent(startKey, + new StaleLocationCacheEntry(fromReplicaId, loc.getRegion().getEndKey())); + LOG.debug("Add entry to stale cache for table {} with startKey {}, {}", + loc.getRegion().getTable(), startKey, loc.getRegion().getEndKey()); + } + + /** + * Select an random replica id. In case there is no replica region configured, return + * the primary replica id. + * @return Replica id + */ + private int getRandomReplicaId() { + int cachedNumOfReplicas = this.numOfReplicas; + if (cachedNumOfReplicas < 0) { + cachedNumOfReplicas = refreshCatalogReplicaCount(); + this.numOfReplicas = cachedNumOfReplicas; + } + // In case of no replica configured, return the primary region id. + if (cachedNumOfReplicas <= 1) { + return RegionInfo.DEFAULT_REPLICA_ID; + } + return 1 + ThreadLocalRandom.current().nextInt(cachedNumOfReplicas - 1); + } + + /** + * When it looks up a location, it will call this method to find a replica region to go. + * For a normal case, > 99% of region locations from catalog/meta replica will be up to date. + * In extreme cases such as region server crashes, it will depends on how fast replication + * catches up. + * + * @param tablename table name it looks up + * @param row key it looks up. + * @param locateType locateType, Only BEFORE and CURRENT will be passed in. + * @return catalog replica id + */ + public int select(final TableName tablename, final byte[] row, + final RegionLocateType locateType) { + ConcurrentNavigableMap<byte[], StaleLocationCacheEntry> tableCache = staleCache.get(tablename); + + // If there is no entry in StaleCache, select a random replica id. 
+ if (tableCache == null) { + return getRandomReplicaId(); + } + + Map.Entry<byte[], StaleLocationCacheEntry> entry; + boolean isEmptyStopRow = isEmptyStopRow(row); + // Only BEFORE and CURRENT are passed in. + if (locateType == RegionLocateType.BEFORE) { + entry = isEmptyStopRow ? tableCache.lastEntry() : tableCache.lowerEntry(row); + } else { + entry = tableCache.floorEntry(row); + } + + // It is not in the stale cache, return a random replica id. + if (entry == null) { + return getRandomReplicaId(); + } + + // The entry here is a possible match for the location. Check if the entry times out first as + // long comparing is faster than comparing byte arrays(in most cases). It could remove + // stale entries faster. If the possible match entry does not time out, it will check if + // the entry is a match for the row passed in and select the replica id accordingly. + if ((EnvironmentEdgeManager.currentTime() - entry.getValue().getTimestamp()) >= + STALE_CACHE_TIMEOUT_IN_MILLISECONDS) { + LOG.debug("Entry for table {} with startKey {}, {} times out", tablename, entry.getKey(), + entry); + tableCache.remove(entry.getKey()); + return getRandomReplicaId(); + } + + byte[] endKey = entry.getValue().getEndKey(); + + // The following logic is borrowed from AsyncNonMetaRegionLocator. + if (isEmptyStopRow(endKey)) { + LOG.debug("Lookup {} goes to primary region", row); + return RegionInfo.DEFAULT_REPLICA_ID; + } + + if (locateType == RegionLocateType.BEFORE) { + if (!isEmptyStopRow && Bytes.compareTo(endKey, row) >= 0) { + LOG.debug("Lookup {} goes to primary meta", row); + return RegionInfo.DEFAULT_REPLICA_ID; + } + } else { + if (Bytes.compareTo(row, endKey) < 0) { + LOG.debug("Lookup {} goes to primary meta", row); + return RegionInfo.DEFAULT_REPLICA_ID; + } + } + + // Not in stale cache, return a random replica id. + return getRandomReplicaId(); + } + + @Override + public void stop(String why) { Review comment: A Chore needs a Stoppable to be passed in. 
I thought that it is better to do it this way than to create a NoOp Stoppable object and pass it in (e.g., as getAuthRenewalChore() does). If there is a better alternative, I can change it accordingly. ########## File path: hbase-client/src/main/java/org/apache/hadoop/hbase/client/AsyncNonMetaRegionLocator.java ########## @@ -82,14 +84,18 @@ static String LOCATE_PREFETCH_LIMIT = "hbase.client.locate.prefetch.limit"; private static final int DEFAULT_LOCATE_PREFETCH_LIMIT = 10; + private static final long GET_META_LOCATIONS_TIMEOUT_IN_MILLISECONDS = 2000; // 2 seconds Review comment: No, a new config is actually something I tried to avoid. "hbase.rpc.read.timeout"? The default value is 1 min, a bit longer but I think it is ok, can adjust based on running results. If this is not the config you meant, please let me know the config, thanks. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org