huaxiangsun commented on a change in pull request #2584:
URL: https://github.com/apache/hbase/pull/2584#discussion_r515749525



##########
File path: 
hbase-client/src/main/java/org/apache/hadoop/hbase/client/AsyncNonMetaRegionLocator.java
##########
@@ -196,8 +202,43 @@ private boolean tryComplete(LocateRequest req, 
CompletableFuture<RegionLocations
       MAX_CONCURRENT_LOCATE_REQUEST_PER_TABLE, 
DEFAULT_MAX_CONCURRENT_LOCATE_REQUEST_PER_TABLE);
     this.locatePrefetchLimit =
       conn.getConfiguration().getInt(LOCATE_PREFETCH_LIMIT, 
DEFAULT_LOCATE_PREFETCH_LIMIT);
-    this.useMetaReplicas =
-      conn.getConfiguration().getBoolean(USE_META_REPLICAS, 
DEFAULT_USE_META_REPLICAS);
+
+    // Get the region locator's meta replica mode.
+    this.metaReplicaMode = CatalogReplicaMode.valueOf(conn.getConfiguration()
+      .get(LOCATOR_META_REPLICAS_MODE, CatalogReplicaMode.None.toString()));
+
+    switch (this.metaReplicaMode) {
+      case LoadBalance:
+        String replicaSelectorClass = conn.getConfiguration().
+          get(RegionLocator.LOCATOR_META_REPLICAS_MODE_LOADBALANCE_SELECTOR,
+          CatalogReplicaLoadBalanceSimpleSelector.class.getName());
+
+        this.metaReplicaSelector = 
CatalogReplicaLoadBalanceSelectorFactory.createSelector(
+          replicaSelectorClass, META_TABLE_NAME, conn, () -> {
+            int numOfReplicas = 1;
+            try {
+              RegionLocations metaLocations = 
conn.registry.getMetaRegionLocations().get(
+                GET_META_LOCATIONS_TIMEOUT, TimeUnit.MILLISECONDS);
+              numOfReplicas = metaLocations.size();
+            } catch (Exception e) {
+              LOG.error("Failed to get table {}'s region replication, ", 
META_TABLE_NAME, e);
+            }
+            return numOfReplicas;
+          });
+        break;
+      case None:
+        // If user does not configure LOCATOR_META_REPLICAS_MODE, let's check 
the legacy config.
+        if (this.metaReplicaMode == CatalogReplicaMode.None) {
+          boolean useMetaReplicas = 
conn.getConfiguration().getBoolean(USE_META_REPLICAS,
+            DEFAULT_USE_META_REPLICAS);
+          if (useMetaReplicas) {
+            this.metaReplicaMode = CatalogReplicaMode.HedgedRead;
+          }
+        }
+        break;
+      default:
+        // Doing nothing

Review comment:
       There will be no exception as metaReplica is already an enum here. The 
default branch is added to avoid complaining from hadoop build.

##########
File path: 
hbase-client/src/main/java/org/apache/hadoop/hbase/RegionLocations.java
##########
@@ -38,6 +38,9 @@
 
   private final int numNonNullElements;
 
+  // For Meta Replica LoadBalance mode, this indicates which meta replica the 
locations come from.
+  private int fromMetaReplicaId;

Review comment:
       Ok, going to remove the replica id for now. We can go back to it later 
when it is needed.

##########
File path: 
hbase-client/src/main/java/org/apache/hadoop/hbase/client/AsyncNonMetaRegionLocator.java
##########
@@ -82,14 +84,18 @@
   static String LOCATE_PREFETCH_LIMIT = "hbase.client.locate.prefetch.limit";
 
   private static final int DEFAULT_LOCATE_PREFETCH_LIMIT = 10;
+  private static final long GET_META_LOCATIONS_TIMEOUT = 2000; // 2 seconds

Review comment:
       This is the timeout value to wait for meta locations from registry. 

##########
File path: 
hbase-client/src/main/java/org/apache/hadoop/hbase/client/AsyncConnectionImpl.java
##########
@@ -182,8 +183,21 @@ public void newDead(ServerName sn) {
   }
 
   private void spawnRenewalChore(final UserGroupInformation user) {
-    authService = new ChoreService("Relogin service");
-    authService.scheduleChore(AuthUtil.getAuthRenewalChore(user));
+    ChoreService service = getChoreService();
+    service.scheduleChore(AuthUtil.getAuthRenewalChore(user));
+  }
+
+  /**
+   * If choreService has not been created yet, create the ChoreService.
+   * @return ChoreService
+   */
+  ChoreService getChoreService() {
+    synchronized (this) {
+      if (choreService == null) {
+        choreService = new ChoreService("AsyncConn Chore Service");
+      }
+    }
+    return choreService;

Review comment:
       Done  with the first approach.

##########
File path: 
hbase-client/src/main/java/org/apache/hadoop/hbase/client/CatalogReplicaLoadBalanceSimpleSelector.java
##########
@@ -0,0 +1,302 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.client;
+
+import static org.apache.hadoop.hbase.client.ConnectionUtils.isEmptyStopRow;
+import static org.apache.hadoop.hbase.util.Bytes.BYTES_COMPARATOR;
+import static org.apache.hadoop.hbase.util.ConcurrentMapUtils.computeIfAbsent;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.ConcurrentNavigableMap;
+import java.util.concurrent.ConcurrentSkipListMap;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.function.IntSupplier;
+import org.apache.commons.lang3.builder.ToStringBuilder;
+import org.apache.hadoop.hbase.HRegionLocation;
+import org.apache.hadoop.hbase.ScheduledChore;
+import org.apache.hadoop.hbase.Stoppable;
+import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * <p>CatalogReplicaLoadBalanceReplicaSimpleSelector implements a simple 
catalog replica load balancing
+ * algorithm. It maintains a stale location cache for each table. Whenever 
client looks up location,
+ * it first check if the row is the stale location cache. If yes, the location 
from
+ * catalog replica is stale, it will go to the primary region to look up 
update-to-date location;
+ * otherwise, it will randomly pick up a replica region for lookup. When 
clients receive
+ * RegionNotServedException from region servers, it will add these region 
locations to the stale
+ * location cache. The stale cache will be cleaned up periodically by a 
chore.</p>
+ *
+ * It follows a simple algorithm to choose a replica to go:
+ *
+ * <ol>
+ *  <li>If there is no stale location entry for rows it looks up, it will 
randomly
+ *     pick a replica region to do lookup. </li>
+ *  <li>If the location from the replica region is stale, client gets 
RegionNotServedException
+ *     from region server, in this case, it will create 
StaleLocationCacheEntry in
+ *     CatalogReplicaLoadBalanceReplicaSimpleSelector.</li>
+ *  <li>When client tries to do location lookup, it checks StaleLocationCache 
first for rows it
+ *     tries to lookup, if entry exists, it will go with primary meta region 
to do lookup;
+ *     otherwise, it will follow step 1.</li>
+ *  <li>A chore will periodically run to clean up cache entries in the 
StaleLocationCache.</li>
+ * </ol>
+ */
+class CatalogReplicaLoadBalanceSimpleSelector implements
+  CatalogReplicaLoadBalanceSelector, Stoppable {
+  private static final Logger LOG =
+    LoggerFactory.getLogger(CatalogReplicaLoadBalanceSimpleSelector.class);
+  private final long STALE_CACHE_TIMEOUT_IN_MILLISECONDS = 3000; // 3 seconds
+  private final int STALE_CACHE_CLEAN_CHORE_INTERVAL_IN_MILLISECONDS = 1500; 
// 1.5 seconds
+  private final int REFRESH_REPLICA_COUNT_CHORE_INTERVAL_IN_MILLISECONDS = 
60000; // 1 minute
+
+  /**
+   * StaleLocationCacheEntry is the entry when a stale location is reported by 
an client.
+   */
+  private static final class StaleLocationCacheEntry {
+    // replica id where the stale location comes from.
+    private final int fromReplicaId;
+
+    // timestamp in milliseconds
+    private final long timestamp;
+
+    private final byte[] endKey;
+
+    StaleLocationCacheEntry(final int metaReplicaId, final byte[] endKey) {
+      this.fromReplicaId = metaReplicaId;
+      this.endKey = endKey;
+      timestamp = EnvironmentEdgeManager.currentTime();
+    }
+
+    public byte[] getEndKey() {
+      return this.endKey;
+    }
+
+    public int getFromReplicaId() {
+      return this.fromReplicaId;
+    }
+    public long getTimestamp() {
+      return this.timestamp;
+    }
+
+    @Override
+    public String toString() {
+      return new ToStringBuilder(this)
+        .append("endKey", endKey)
+        .append("fromReplicaId", fromReplicaId)
+        .append("timestamp", timestamp)
+        .toString();
+    }
+  }
+
+  private final ConcurrentMap<TableName, ConcurrentNavigableMap<byte[], 
StaleLocationCacheEntry>>
+    staleCache = new ConcurrentHashMap<>();
+  private volatile int numOfReplicas;
+  private final AsyncConnectionImpl conn;
+  private final TableName tableName;
+  private final IntSupplier getNumOfReplicas;
+  private volatile boolean isStopped = false;
+
+  CatalogReplicaLoadBalanceSimpleSelector(TableName tableName, 
AsyncConnectionImpl conn,
+    IntSupplier getNumOfReplicas) {
+    this.conn = conn;
+    this.tableName = tableName;
+    this.getNumOfReplicas = getNumOfReplicas;
+
+    // This numOfReplicas is going to be lazy initialized.
+    this.numOfReplicas = -1;
+    // Start chores
+    this.conn.getChoreService().scheduleChore(getCacheCleanupChore(this));
+    
this.conn.getChoreService().scheduleChore(getRefreshReplicaCountChore(this));
+  }
+
+  /**
+   * When a client runs into RegionNotServingException, it will call this 
method to
+   * update Selector's internal state.
+   * @param loc the location which causes exception.
+   * @param fromReplicaId the replica id where the stale location comes from.
+   */
+  public void onError(HRegionLocation loc, int fromReplicaId) {
+    ConcurrentNavigableMap<byte[], StaleLocationCacheEntry> tableCache =
+      computeIfAbsent(staleCache, loc.getRegion().getTable(),
+        () -> new ConcurrentSkipListMap<>(BYTES_COMPARATOR));
+    byte[] startKey = loc.getRegion().getStartKey();
+    tableCache.putIfAbsent(startKey,
+      new StaleLocationCacheEntry(fromReplicaId, loc.getRegion().getEndKey()));
+    LOG.debug("Add entry to stale cache for table {} with startKey {}, {}",
+      loc.getRegion().getTable(), startKey, loc.getRegion().getEndKey());
+  }
+
+  /**
+   * Select an random replica id. In case there is no replica region 
configured, return
+   * the primary replica id.
+   * @return Replica id
+   */
+  private int getRandomReplicaId() {
+    int cachedNumOfReplicas = this.numOfReplicas;
+    if (cachedNumOfReplicas < 0) {
+      cachedNumOfReplicas = refreshCatalogReplicaCount();
+      this.numOfReplicas = cachedNumOfReplicas;
+    }
+    // In case of no replica configured, return the primary region id.
+    if (cachedNumOfReplicas <= 1) {
+      return RegionInfo.DEFAULT_REPLICA_ID;
+    }
+    return 1 + ThreadLocalRandom.current().nextInt(cachedNumOfReplicas - 1);
+  }
+
+  /**
+   * When it looks up a location, it will call this method to find a replica 
region to go.
+   * For a normal case, > 99% of region locations from catalog/meta replica 
will be up to date.
+   * In extreme cases such as region server crashes, it will depends on how 
fast replication
+   * catches up.
+   *
+   * @param tablename  table name it looks up
+   * @param row   key it looks up.
+   * @param locateType locateType, Only BEFORE and CURRENT will be passed in.
+   * @return catalog replica id
+   */
+  public int select(final TableName tablename, final byte[] row,
+    final RegionLocateType locateType) {
+    ConcurrentNavigableMap<byte[], StaleLocationCacheEntry> tableCache = 
staleCache.get(tablename);
+
+    // If there is no entry in StaleCache, select a random replica id.
+    if (tableCache == null) {
+      return getRandomReplicaId();
+    }
+
+    Map.Entry<byte[], StaleLocationCacheEntry> entry;
+    boolean isEmptyStopRow = isEmptyStopRow(row);
+    // Only BEFORE and CURRENT are passed in.
+    if (locateType == RegionLocateType.BEFORE) {
+      entry = isEmptyStopRow ? tableCache.lastEntry() : 
tableCache.lowerEntry(row);
+    } else {
+      entry = tableCache.floorEntry(row);
+    }
+
+    // It is not in the stale cache, return a random replica id.
+    if (entry == null) {
+      return getRandomReplicaId();
+    }
+
+    // The entry here is a possible match for the location. Check if the entry 
times out first as
+    // long comparing is faster than comparing byte arrays(in most cases). It 
could remove
+    // stale entries faster. If the possible match entry does not time out, it 
will check if
+    // the entry is a match for the row passed in and select the replica id 
accordingly.
+    if ((EnvironmentEdgeManager.currentTime() - 
entry.getValue().getTimestamp()) >=
+      STALE_CACHE_TIMEOUT_IN_MILLISECONDS) {
+      LOG.debug("Entry for table {} with startKey {}, {} times out", 
tablename, entry.getKey(),
+        entry);
+      tableCache.remove(entry.getKey());
+      return getRandomReplicaId();
+    }
+
+    byte[] endKey =  entry.getValue().getEndKey();
+
+    // The following logic is borrowed from AsyncNonMetaRegionLocator.
+    if (isEmptyStopRow(endKey)) {
+      LOG.debug("Lookup {} goes to primary region", row);
+      return RegionInfo.DEFAULT_REPLICA_ID;
+    }
+
+    if (locateType == RegionLocateType.BEFORE) {
+      if (!isEmptyStopRow && Bytes.compareTo(endKey, row) >= 0) {
+        LOG.debug("Lookup {} goes to primary meta", row);
+        return RegionInfo.DEFAULT_REPLICA_ID;
+      }
+    } else {
+      if (Bytes.compareTo(row, endKey) < 0) {
+        LOG.debug("Lookup {} goes to primary meta", row);
+        return RegionInfo.DEFAULT_REPLICA_ID;
+      }
+    }
+
+    // Not in stale cache, return a random replica id.
+    return getRandomReplicaId();
+  }
+
+  @Override
+  public void stop(String why) {

Review comment:
       It is not called currently. AsyncConnectionImpl (owning choreService) 
cancels chores when closing. 

##########
File path: 
hbase-client/src/main/java/org/apache/hadoop/hbase/client/AsyncNonMetaRegionLocator.java
##########
@@ -577,6 +635,15 @@ private void removeLocationFromCache(HRegionLocation loc) {
       if (!canUpdateOnError(loc, oldLoc)) {
         return;
       }
+      // Tell metaReplicaSelector that the location is stale. It will create a 
stale entry
+      // with timestamp internally. Next time the client looks up the same 
location,
+      // it will pick a different meta replica region. For the current 
implementation,
+      // the metaReplicaId is not used, so the primary one is passed in.
+      if (this.metaReplicaMode == CatalogReplicaMode.LoadBalance) {
+        // metaReplicaId is not used in simpleSelector, default to the primary 
one.
+        metaReplicaSelector.onError(loc, RegionInfo.DEFAULT_REPLICA_ID);

Review comment:
       As discussed, removed fromReplicaId for now. Will go back to this part 
later when it is needed.

##########
File path: 
hbase-client/src/main/java/org/apache/hadoop/hbase/RegionLocations.java
##########
@@ -86,6 +91,25 @@ public RegionLocations(HRegionLocation... locations) {
     this.numNonNullElements = numNonNullElements;
   }
 
+  /**
+   * Constructs the region location list. The locations array should
+   * contain all the locations for known replicas for the region, and should be
+   * sorted in replicaId ascending order, although it can contain nulls 
indicating replicaIds
+   * that the locations of which are not known.
+   * @param locations an array of HRegionLocations for the same region range
+   */
+  public RegionLocations(HRegionLocation... locations) {

Review comment:
       The latest patch undid this change so there is no change in 
RegionLocations.

##########
File path: 
hbase-client/src/main/java/org/apache/hadoop/hbase/client/AsyncNonMetaRegionLocator.java
##########
@@ -196,8 +202,43 @@ private boolean tryComplete(LocateRequest req, 
CompletableFuture<RegionLocations
       MAX_CONCURRENT_LOCATE_REQUEST_PER_TABLE, 
DEFAULT_MAX_CONCURRENT_LOCATE_REQUEST_PER_TABLE);
     this.locatePrefetchLimit =
       conn.getConfiguration().getInt(LOCATE_PREFETCH_LIMIT, 
DEFAULT_LOCATE_PREFETCH_LIMIT);
-    this.useMetaReplicas =
-      conn.getConfiguration().getBoolean(USE_META_REPLICAS, 
DEFAULT_USE_META_REPLICAS);
+
+    // Get the region locator's meta replica mode.
+    this.metaReplicaMode = CatalogReplicaMode.valueOf(conn.getConfiguration()
+      .get(LOCATOR_META_REPLICAS_MODE, CatalogReplicaMode.None.toString()));
+
+    switch (this.metaReplicaMode) {
+      case LoadBalance:
+        String replicaSelectorClass = conn.getConfiguration().
+          get(RegionLocator.LOCATOR_META_REPLICAS_MODE_LOADBALANCE_SELECTOR,
+          CatalogReplicaLoadBalanceSimpleSelector.class.getName());
+
+        this.metaReplicaSelector = 
CatalogReplicaLoadBalanceSelectorFactory.createSelector(
+          replicaSelectorClass, META_TABLE_NAME, conn, () -> {
+            int numOfReplicas = 1;
+            try {
+              RegionLocations metaLocations = 
conn.registry.getMetaRegionLocations().get(
+                GET_META_LOCATIONS_TIMEOUT, TimeUnit.MILLISECONDS);
+              numOfReplicas = metaLocations.size();
+            } catch (Exception e) {
+              LOG.error("Failed to get table {}'s region replication, ", 
META_TABLE_NAME, e);
+            }
+            return numOfReplicas;
+          });
+        break;
+      case None:
+        // If user does not configure LOCATOR_META_REPLICAS_MODE, let's check 
the legacy config.
+        if (this.metaReplicaMode == CatalogReplicaMode.None) {

Review comment:
       Nice catch!

##########
File path: 
hbase-client/src/main/java/org/apache/hadoop/hbase/client/AsyncNonMetaRegionLocator.java
##########
@@ -196,8 +202,43 @@ private boolean tryComplete(LocateRequest req, 
CompletableFuture<RegionLocations
       MAX_CONCURRENT_LOCATE_REQUEST_PER_TABLE, 
DEFAULT_MAX_CONCURRENT_LOCATE_REQUEST_PER_TABLE);
     this.locatePrefetchLimit =
       conn.getConfiguration().getInt(LOCATE_PREFETCH_LIMIT, 
DEFAULT_LOCATE_PREFETCH_LIMIT);
-    this.useMetaReplicas =
-      conn.getConfiguration().getBoolean(USE_META_REPLICAS, 
DEFAULT_USE_META_REPLICAS);
+
+    // Get the region locator's meta replica mode.
+    this.metaReplicaMode = CatalogReplicaMode.valueOf(conn.getConfiguration()
+      .get(LOCATOR_META_REPLICAS_MODE, CatalogReplicaMode.None.toString()));
+
+    switch (this.metaReplicaMode) {
+      case LoadBalance:
+        String replicaSelectorClass = conn.getConfiguration().

Review comment:
       The factory is CatalogReplicaLoadBalanceSelectorFactory, which could be 
used for future root table. The configuration for meta and future system tables 
are different so I think it is better to exclude all these table specific 
configuration outside of CatalogReplicaLoadBalanceSelectorFactory.

##########
File path: 
hbase-client/src/main/java/org/apache/hadoop/hbase/client/CatalogReplicaLoadBalanceSimpleSelector.java
##########
@@ -0,0 +1,293 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.client;
+
+import static org.apache.hadoop.hbase.client.ConnectionUtils.isEmptyStopRow;
+import static org.apache.hadoop.hbase.util.Bytes.BYTES_COMPARATOR;
+import static org.apache.hadoop.hbase.util.ConcurrentMapUtils.computeIfAbsent;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.ConcurrentNavigableMap;
+import java.util.concurrent.ConcurrentSkipListMap;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.function.IntSupplier;
+import org.apache.commons.lang3.builder.ToStringBuilder;
+import org.apache.hadoop.hbase.HRegionLocation;
+import org.apache.hadoop.hbase.ScheduledChore;
+import org.apache.hadoop.hbase.Stoppable;
+import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * <p>CatalogReplicaLoadBalanceReplicaSimpleSelector implements a simple 
catalog replica load balancing
+ * algorithm. It maintains a stale location cache for each table. Whenever 
client looks up location,
+ * it first check if the row is the stale location cache. If yes, the location 
from
+ * catalog replica is stale, it will go to the primary region to look up 
update-to-date location;
+ * otherwise, it will randomly pick up a replica region for lookup. When 
clients receive
+ * RegionNotServedException from region servers, it will add these region 
locations to the stale
+ * location cache. The stale cache will be cleaned up periodically by a 
chore.</p>
+ *
+ * It follows a simple algorithm to choose a replica to go:
+ *
+ * <ol>
+ *  <li>If there is no stale location entry for rows it looks up, it will 
randomly
+ *     pick a replica region to do lookup. </li>
+ *  <li>If the location from the replica region is stale, client gets 
RegionNotServedException
+ *     from region server, in this case, it will create 
StaleLocationCacheEntry in
+ *     CatalogReplicaLoadBalanceReplicaSimpleSelector.</li>
+ *  <li>When client tries to do location lookup, it checks StaleLocationCache 
first for rows it
+ *     tries to lookup, if entry exists, it will go with primary meta region 
to do lookup;
+ *     otherwise, it will follow step 1.</li>
+ *  <li>A chore will periodically run to clean up cache entries in the 
StaleLocationCache.</li>
+ * </ol>
+ */
+class CatalogReplicaLoadBalanceSimpleSelector implements
+  CatalogReplicaLoadBalanceSelector, Stoppable {
+  private static final Logger LOG =
+    LoggerFactory.getLogger(CatalogReplicaLoadBalanceSimpleSelector.class);
+  private final long STALE_CACHE_TIMEOUT_IN_MILLISECONDS = 3000; // 3 seconds
+  private final int STALE_CACHE_CLEAN_CHORE_INTERVAL_IN_MILLISECONDS = 1500; 
// 1.5 seconds
+  private final int REFRESH_REPLICA_COUNT_CHORE_INTERVAL_IN_MILLISECONDS = 
60000; // 1 minute
+
+  /**
+   * StaleLocationCacheEntry is the entry when a stale location is reported by 
an client.
+   */
+  private static final class StaleLocationCacheEntry {
+    // timestamp in milliseconds
+    private final long timestamp;
+
+    private final byte[] endKey;
+
+    StaleLocationCacheEntry(final byte[] endKey) {
+      this.endKey = endKey;
+      timestamp = EnvironmentEdgeManager.currentTime();
+    }
+
+    public byte[] getEndKey() {
+      return this.endKey;
+    }
+
+    public long getTimestamp() {
+      return this.timestamp;
+    }
+
+    @Override
+    public String toString() {
+      return new ToStringBuilder(this)
+        .append("endKey", endKey)
+        .append("timestamp", timestamp)
+        .toString();
+    }
+  }
+
+  private final ConcurrentMap<TableName, ConcurrentNavigableMap<byte[], 
StaleLocationCacheEntry>>
+    staleCache = new ConcurrentHashMap<>();
+  private volatile int numOfReplicas;
+  private final AsyncConnectionImpl conn;
+  private final TableName tableName;
+  private final IntSupplier getNumOfReplicas;
+  private volatile boolean isStopped = false;
+
+  CatalogReplicaLoadBalanceSimpleSelector(TableName tableName, 
AsyncConnectionImpl conn,
+    IntSupplier getNumOfReplicas) {
+    this.conn = conn;
+    this.tableName = tableName;
+    this.getNumOfReplicas = getNumOfReplicas;
+
+    // This numOfReplicas is going to be lazy initialized.
+    this.numOfReplicas = -1;
+    // Start chores
+    this.conn.getChoreService().scheduleChore(getCacheCleanupChore(this));
+    
this.conn.getChoreService().scheduleChore(getRefreshReplicaCountChore(this));
+  }
+
+  /**
+   * When a client runs into RegionNotServingException, it will call this 
method to
+   * update Selector's internal state.
+   * @param loc the location which causes exception.
+   */
+  public void onError(HRegionLocation loc) {
+    ConcurrentNavigableMap<byte[], StaleLocationCacheEntry> tableCache =
+      computeIfAbsent(staleCache, loc.getRegion().getTable(),
+        () -> new ConcurrentSkipListMap<>(BYTES_COMPARATOR));
+    byte[] startKey = loc.getRegion().getStartKey();
+    tableCache.putIfAbsent(startKey,
+      new StaleLocationCacheEntry(loc.getRegion().getEndKey()));
+    LOG.debug("Add entry to stale cache for table {} with startKey {}, {}",
+      loc.getRegion().getTable(), startKey, loc.getRegion().getEndKey());
+  }
+
+  /**
+   * Select an random replica id. In case there is no replica region 
configured, return
+   * the primary replica id.
+   * @return Replica id
+   */
+  private int getRandomReplicaId() {
+    int cachedNumOfReplicas = this.numOfReplicas;
+    if (cachedNumOfReplicas < 0) {
+      cachedNumOfReplicas = refreshCatalogReplicaCount();
+      this.numOfReplicas = cachedNumOfReplicas;
+    }
+    // In case of no replica configured, return the primary region id.
+    if (cachedNumOfReplicas <= 1) {
+      return RegionInfo.DEFAULT_REPLICA_ID;
+    }
+    return 1 + ThreadLocalRandom.current().nextInt(cachedNumOfReplicas - 1);
+  }
+
+  /**
+   * When it looks up a location, it will call this method to find a replica 
region to go.
+   * For a normal case, > 99% of region locations from catalog/meta replica 
will be up to date.
+   * In extreme cases such as region server crashes, it will depends on how 
fast replication
+   * catches up.
+   *
+   * @param tablename  table name it looks up
+   * @param row   key it looks up.
+   * @param locateType locateType, Only BEFORE and CURRENT will be passed in.
+   * @return catalog replica id
+   */
+  public int select(final TableName tablename, final byte[] row,
+    final RegionLocateType locateType) {
+    ConcurrentNavigableMap<byte[], StaleLocationCacheEntry> tableCache = 
staleCache.get(tablename);
+
+    // If there is no entry in StaleCache, select a random replica id.
+    if (tableCache == null) {
+      return getRandomReplicaId();
+    }
+
+    Map.Entry<byte[], StaleLocationCacheEntry> entry;
+    boolean isEmptyStopRow = isEmptyStopRow(row);
+    // Only BEFORE and CURRENT are passed in.

Review comment:
       Sorry, missed this before. Just checked,  Preconditions.checkArgument() 
already throws out IllegalArgumentException, so "should not happen" is not 
needed?

##########
File path: 
hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestAsyncNonMetaRegionLocator.java
##########
@@ -55,35 +58,53 @@
 import org.junit.ClassRule;
 import org.junit.Test;
 import org.junit.experimental.categories.Category;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 @Category({ MediumTests.class, ClientTests.class })
+@RunWith(Parameterized.class)

Review comment:
       I roughly went through MiniClusterRule, seems that I still need to keep  
@BeforeClass to do set of things. Would like to use it in the new tests, keep 
it as it is for existing test case for now.

##########
File path: 
hbase-client/src/main/java/org/apache/hadoop/hbase/client/CatalogReplicaLoadBalanceSimpleSelector.java
##########
@@ -0,0 +1,302 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.client;
+
+import static org.apache.hadoop.hbase.client.ConnectionUtils.isEmptyStopRow;
+import static org.apache.hadoop.hbase.util.Bytes.BYTES_COMPARATOR;
+import static org.apache.hadoop.hbase.util.ConcurrentMapUtils.computeIfAbsent;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.ConcurrentNavigableMap;
+import java.util.concurrent.ConcurrentSkipListMap;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.function.IntSupplier;
+import org.apache.commons.lang3.builder.ToStringBuilder;
+import org.apache.hadoop.hbase.HRegionLocation;
+import org.apache.hadoop.hbase.ScheduledChore;
+import org.apache.hadoop.hbase.Stoppable;
+import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * <p>CatalogReplicaLoadBalanceReplicaSimpleSelector implements a simple 
catalog replica load balancing
+ * algorithm. It maintains a stale location cache for each table. Whenever 
client looks up location,
+ * it first check if the row is the stale location cache. If yes, the location 
from
+ * catalog replica is stale, it will go to the primary region to look up 
update-to-date location;
+ * otherwise, it will randomly pick up a replica region for lookup. When 
clients receive
+ * RegionNotServedException from region servers, it will add these region 
locations to the stale
+ * location cache. The stale cache will be cleaned up periodically by a 
chore.</p>
+ *
+ * It follows a simple algorithm to choose a replica to go:
+ *
+ * <ol>
+ *  <li>If there is no stale location entry for rows it looks up, it will 
randomly
+ *     pick a replica region to do lookup. </li>
+ *  <li>If the location from the replica region is stale, client gets 
RegionNotServedException
+ *     from region server, in this case, it will create 
StaleLocationCacheEntry in
+ *     CatalogReplicaLoadBalanceReplicaSimpleSelector.</li>
+ *  <li>When client tries to do location lookup, it checks StaleLocationCache 
first for rows it
+ *     tries to lookup, if entry exists, it will go with primary meta region 
to do lookup;
+ *     otherwise, it will follow step 1.</li>
+ *  <li>A chore will periodically run to clean up cache entries in the 
StaleLocationCache.</li>
+ * </ol>
+ */
+class CatalogReplicaLoadBalanceSimpleSelector implements
+  CatalogReplicaLoadBalanceSelector, Stoppable {
+  private static final Logger LOG =
+    LoggerFactory.getLogger(CatalogReplicaLoadBalanceSimpleSelector.class);
+  private final long STALE_CACHE_TIMEOUT_IN_MILLISECONDS = 3000; // 3 seconds
+  private final int STALE_CACHE_CLEAN_CHORE_INTERVAL_IN_MILLISECONDS = 1500; 
// 1.5 seconds
+  private final int REFRESH_REPLICA_COUNT_CHORE_INTERVAL_IN_MILLISECONDS = 
60000; // 1 minute
+
+  /**
+   * StaleLocationCacheEntry is the entry when a stale location is reported by 
an client.
+   */
+  private static final class StaleLocationCacheEntry {
+    // replica id where the stale location comes from.
+    private final int fromReplicaId;
+
+    // timestamp in milliseconds
+    private final long timestamp;
+
+    private final byte[] endKey;
+
+    StaleLocationCacheEntry(final int metaReplicaId, final byte[] endKey) {
+      this.fromReplicaId = metaReplicaId;
+      this.endKey = endKey;
+      timestamp = EnvironmentEdgeManager.currentTime();
+    }
+
+    public byte[] getEndKey() {
+      return this.endKey;
+    }
+
+    public int getFromReplicaId() {
+      return this.fromReplicaId;
+    }
+    public long getTimestamp() {
+      return this.timestamp;
+    }
+
+    @Override
+    public String toString() {
+      return new ToStringBuilder(this)
+        .append("endKey", endKey)
+        .append("fromReplicaId", fromReplicaId)
+        .append("timestamp", timestamp)
+        .toString();
+    }
+  }
+
+  private final ConcurrentMap<TableName, ConcurrentNavigableMap<byte[], 
StaleLocationCacheEntry>>
+    staleCache = new ConcurrentHashMap<>();
+  private volatile int numOfReplicas;
+  private final AsyncConnectionImpl conn;
+  private final TableName tableName;
+  private final IntSupplier getNumOfReplicas;
+  private volatile boolean isStopped = false;
+
+  CatalogReplicaLoadBalanceSimpleSelector(TableName tableName, 
AsyncConnectionImpl conn,
+    IntSupplier getNumOfReplicas) {
+    this.conn = conn;
+    this.tableName = tableName;
+    this.getNumOfReplicas = getNumOfReplicas;
+
+    // This numOfReplicas is going to be lazy initialized.
+    this.numOfReplicas = -1;
+    // Start chores
+    this.conn.getChoreService().scheduleChore(getCacheCleanupChore(this));
+    
this.conn.getChoreService().scheduleChore(getRefreshReplicaCountChore(this));
+  }
+
+  /**
+   * When a client runs into RegionNotServingException, it will call this 
method to
+   * update Selector's internal state.
+   * @param loc the location which causes exception.
+   * @param fromReplicaId the replica id where the stale location comes from.
+   */
+  public void onError(HRegionLocation loc, int fromReplicaId) {
+    ConcurrentNavigableMap<byte[], StaleLocationCacheEntry> tableCache =
+      computeIfAbsent(staleCache, loc.getRegion().getTable(),
+        () -> new ConcurrentSkipListMap<>(BYTES_COMPARATOR));
+    byte[] startKey = loc.getRegion().getStartKey();
+    tableCache.putIfAbsent(startKey,
+      new StaleLocationCacheEntry(fromReplicaId, loc.getRegion().getEndKey()));
+    LOG.debug("Add entry to stale cache for table {} with startKey {}, {}",
+      loc.getRegion().getTable(), startKey, loc.getRegion().getEndKey());
+  }
+
+  /**
+   * Select an random replica id. In case there is no replica region 
configured, return
+   * the primary replica id.
+   * @return Replica id
+   */
+  private int getRandomReplicaId() {
+    int cachedNumOfReplicas = this.numOfReplicas;
+    if (cachedNumOfReplicas < 0) {
+      cachedNumOfReplicas = refreshCatalogReplicaCount();
+      this.numOfReplicas = cachedNumOfReplicas;
+    }
+    // In case of no replica configured, return the primary region id.
+    if (cachedNumOfReplicas <= 1) {
+      return RegionInfo.DEFAULT_REPLICA_ID;
+    }
+    return 1 + ThreadLocalRandom.current().nextInt(cachedNumOfReplicas - 1);
+  }
+
+  /**
+   * When it looks up a location, it will call this method to find a replica 
region to go.
+   * For a normal case, > 99% of region locations from catalog/meta replica 
will be up to date.
+   * In extreme cases such as region server crashes, it will depends on how 
fast replication
+   * catches up.
+   *
+   * @param tablename  table name it looks up
+   * @param row   key it looks up.
+   * @param locateType locateType, Only BEFORE and CURRENT will be passed in.
+   * @return catalog replica id
+   */
+  public int select(final TableName tablename, final byte[] row,
+    final RegionLocateType locateType) {
+    ConcurrentNavigableMap<byte[], StaleLocationCacheEntry> tableCache = 
staleCache.get(tablename);
+
+    // If there is no entry in StaleCache, select a random replica id.
+    if (tableCache == null) {
+      return getRandomReplicaId();
+    }
+
+    Map.Entry<byte[], StaleLocationCacheEntry> entry;
+    boolean isEmptyStopRow = isEmptyStopRow(row);
+    // Only BEFORE and CURRENT are passed in.
+    if (locateType == RegionLocateType.BEFORE) {
+      entry = isEmptyStopRow ? tableCache.lastEntry() : 
tableCache.lowerEntry(row);
+    } else {
+      entry = tableCache.floorEntry(row);
+    }
+
+    // It is not in the stale cache, return a random replica id.
+    if (entry == null) {
+      return getRandomReplicaId();
+    }
+
+    // The entry here is a possible match for the location. Check if the entry 
times out first as
+    // long comparing is faster than comparing byte arrays(in most cases). It 
could remove
+    // stale entries faster. If the possible match entry does not time out, it 
will check if
+    // the entry is a match for the row passed in and select the replica id 
accordingly.
+    if ((EnvironmentEdgeManager.currentTime() - 
entry.getValue().getTimestamp()) >=
+      STALE_CACHE_TIMEOUT_IN_MILLISECONDS) {
+      LOG.debug("Entry for table {} with startKey {}, {} times out", 
tablename, entry.getKey(),
+        entry);
+      tableCache.remove(entry.getKey());
+      return getRandomReplicaId();
+    }
+
+    byte[] endKey =  entry.getValue().getEndKey();
+
+    // The following logic is borrowed from AsyncNonMetaRegionLocator.
+    if (isEmptyStopRow(endKey)) {
+      LOG.debug("Lookup {} goes to primary region", row);
+      return RegionInfo.DEFAULT_REPLICA_ID;
+    }
+
+    if (locateType == RegionLocateType.BEFORE) {
+      if (!isEmptyStopRow && Bytes.compareTo(endKey, row) >= 0) {
+        LOG.debug("Lookup {} goes to primary meta", row);
+        return RegionInfo.DEFAULT_REPLICA_ID;
+      }
+    } else {
+      if (Bytes.compareTo(row, endKey) < 0) {
+        LOG.debug("Lookup {} goes to primary meta", row);
+        return RegionInfo.DEFAULT_REPLICA_ID;
+      }
+    }
+
+    // Not in stale cache, return a random replica id.
+    return getRandomReplicaId();
+  }
+
+  @Override
+  public void stop(String why) {

Review comment:
       A Chore needs a Stoppable to be passed in. I thought that it is better 
to do in this way than creating a NoOp Stoppable object and pass it in (eg, 
getAuthRenewalChore() does). If there is better alternative, I can change it 
accordingly.

##########
File path: 
hbase-client/src/main/java/org/apache/hadoop/hbase/client/AsyncNonMetaRegionLocator.java
##########
@@ -82,14 +84,18 @@
   static String LOCATE_PREFETCH_LIMIT = "hbase.client.locate.prefetch.limit";
 
   private static final int DEFAULT_LOCATE_PREFETCH_LIMIT = 10;
+  private static final long GET_META_LOCATIONS_TIMEOUT_IN_MILLISECONDS = 2000; 
// 2 seconds

Review comment:
       No, a new config is actually something I tried to avoid. 
"hbase.rpc.read.timeout"?  The default value is 1 min, a bit longer but I think 
it is ok, can adjust based on running results. If this is not the config you 
meant, please let me the know the config, thanks.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to