neils-dev commented on code in PR #3297:
URL: https://github.com/apache/ozone/pull/3297#discussion_r850027258


##########
hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/GrpcOmTransport.java:
##########
@@ -0,0 +1,330 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.ozone.om.protocolPB;
+
+import java.io.IOException;
+import java.lang.reflect.Constructor;
+import java.security.cert.X509Certificate;
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.HashMap;
+import java.util.Map;
+
+import com.google.common.net.HostAndPort;
+import io.grpc.Status;
+import io.grpc.StatusRuntimeException;
+import org.apache.hadoop.ipc.RemoteException;
+
+import org.apache.hadoop.hdds.conf.Config;
+import org.apache.hadoop.hdds.conf.ConfigGroup;
+import org.apache.hadoop.hdds.conf.ConfigTag;
+import org.apache.hadoop.hdds.conf.ConfigurationSource;
+import org.apache.hadoop.hdds.security.x509.SecurityConfig;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.retry.RetryPolicy;
+import org.apache.hadoop.ozone.OzoneConfigKeys;
+import org.apache.hadoop.ozone.OzoneConsts;
+import org.apache.hadoop.ozone.om.exceptions.OMException;
+import org.apache.hadoop.ozone.om.exceptions.OMException.ResultCodes;
+import 
org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.OMRequest;
+import 
org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.OMResponse;
+import org.apache.hadoop.security.UserGroupInformation;
+
+import org.apache.hadoop.ozone.om.ha.GrpcOMFailoverProxyProvider;
+import org.apache.hadoop.ozone.protocol.proto.OzoneManagerServiceGrpc;
+import io.grpc.ManagedChannel;
+import io.grpc.netty.GrpcSslContexts;
+import io.grpc.netty.NettyChannelBuilder;
+import io.netty.handler.ssl.SslContextBuilder;
+
+import com.google.common.annotations.VisibleForTesting;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import static org.apache.hadoop.ozone.om.OMConfigKeys
+    .OZONE_OM_GRPC_MAXIMUM_RESPONSE_LENGTH;
+import static org.apache.hadoop.ozone.om.OMConfigKeys
+    .OZONE_OM_GRPC_MAXIMUM_RESPONSE_LENGTH_DEFAULT;
+
+/**
+ * Grpc transport for grpc between s3g and om.
+ */
+public class GrpcOmTransport implements OmTransport {
+  private static final Logger LOG =
+      LoggerFactory.getLogger(GrpcOmTransport.class);
+
+  private static final String CLIENT_NAME = "GrpcOmTransport"; // name printed in the "started" log line
+  private final AtomicBoolean isRunning = new AtomicBoolean(false); // flipped to true by the first start() call
+
+  // gRPC specific
+  private static List<X509Certificate> caCerts = null; // assigned by setCaCerts(); passed to the TLS trust manager in start()
+
+  private OzoneManagerServiceGrpc.OzoneManagerServiceBlockingStub client; // NOTE(review): not referenced in the visible part of this class
+  private Map<String,
+      OzoneManagerServiceGrpc.OzoneManagerServiceBlockingStub> clients; // blocking stubs keyed by OM gRPC address
+  private Map<String, ManagedChannel> channels; // channels keyed by the same address as clients
+  private int lastVisited = -1; // NOTE(review): not referenced in the visible part of this class
+  private ConfigurationSource conf;
+
+  private AtomicReference<String> host; // address of the OM that submitRequest() currently targets
+  private int maxSize; // read from OZONE_OM_GRPC_MAXIMUM_RESPONSE_LENGTH in the constructor
+  private SecurityConfig secConfig;
+
+  /**
+   * Stores the CA certificates that start() hands to the TLS trust manager
+   * when gRPC TLS and Ozone security are both enabled.
+   *
+   * @param x509Certificates certificates to trust; the list is kept by
+   *                         reference, not copied
+   */
+  public static void setCaCerts(List<X509Certificate> x509Certificates) {
+    GrpcOmTransport.caCerts = x509Certificates;
+  }
+
+  private List<String> oms; // NOTE(review): not referenced in the visible part of this class
+  private RetryPolicy retryPolicy; // obtained from omFailoverProxyProvider in start()
+  private int failoverCount = 0; // post-incremented on every shouldRetry() call
+  private GrpcOMFailoverProxyProvider<OzoneManagerProtocolPB>
+      omFailoverProxyProvider; // created in the constructor; supplies OM node ids and gRPC addresses
+
+  /**
+   * Creates the transport, reads its settings from the configuration,
+   * builds the failover proxy provider and then calls start().
+   *
+   * @param conf        configuration the gRPC and security settings are
+   *                    read from
+   * @param ugi         user information forwarded to the failover proxy
+   *                    provider
+   * @param omServiceId OM service id forwarded to the failover proxy
+   *                    provider
+   * @throws IOException if the failover proxy provider or start() fails
+   */
+  public GrpcOmTransport(ConfigurationSource conf,
+                          UserGroupInformation ugi, String omServiceId)
+      throws IOException {
+
+    this.channels = new HashMap<>();
+    this.clients = new HashMap<>();
+    this.conf = conf;
+    // Diamond instead of the raw type so the assignment is type-checked.
+    this.host = new AtomicReference<>();
+
+    secConfig = new SecurityConfig(conf);
+    maxSize = conf.getInt(OZONE_OM_GRPC_MAXIMUM_RESPONSE_LENGTH,
+        OZONE_OM_GRPC_MAXIMUM_RESPONSE_LENGTH_DEFAULT);
+
+    // Diamond instead of the raw type; the type argument is inferred from
+    // the field declaration.
+    omFailoverProxyProvider = new GrpcOMFailoverProxyProvider<>(
+        conf,
+        ugi,
+        omServiceId,
+        OzoneManagerProtocolPB.class);
+
+    start();
+  }
+
+  /**
+   * Builds one gRPC channel and one blocking stub per OM node reported by
+   * the failover proxy provider, then fetches the retry policy.
+   *
+   * The current OM address is refreshed on every call. Everything else
+   * runs only on the first call; later calls log a message and return.
+   *
+   * @throws IOException declared for callers; TLS setup failures are
+   *                     logged here and the channel stays plaintext
+   */
+  public void start() throws IOException {
+    host.set(omFailoverProxyProvider
+        .getGrpcProxyAddress(
+            omFailoverProxyProvider.getCurrentProxyOMNodeId()));
+
+    if (!isRunning.compareAndSet(false, true)) {
+      LOG.info("Ignore. already started.");
+      return;
+    }
+
+    List<String> nodes = omFailoverProxyProvider.getGrpcOmNodeIDList();
+    for (String nodeId : nodes) {
+      String hostaddr = omFailoverProxyProvider.getGrpcProxyAddress(nodeId);
+      HostAndPort hp = HostAndPort.fromString(hostaddr);
+
+      // Plaintext is the default and is replaced below only when TLS can
+      // actually be configured. The inbound limit honours the configured
+      // maximum response length (maxSize) instead of a hard-coded constant.
+      NettyChannelBuilder channelBuilder =
+          NettyChannelBuilder.forAddress(hp.getHost(), hp.getPort())
+              .usePlaintext()
+              .maxInboundMessageSize(maxSize);
+
+      if (secConfig.isGrpcTlsEnabled()) {
+        try {
+          SslContextBuilder sslContextBuilder = GrpcSslContexts.forClient();
+          if (secConfig.isSecurityEnabled()) {
+            if (caCerts != null) {
+              sslContextBuilder.trustManager(caCerts);
+            } else {
+              LOG.error("x509Certicates empty");
+            }
+            channelBuilder.useTransportSecurity().
+                sslContext(sslContextBuilder.build());
+          } else {
+            LOG.error("ozone.security not enabled when TLS specified," +
+                " using plaintext");
+          }
+        } catch (Exception ex) {
+          // Keep the best-effort behaviour, but record why TLS failed.
+          LOG.error("cannot establish TLS for grpc om transport client", ex);
+        }
+      }
+
+      ManagedChannel channel = channelBuilder.build();
+      channels.put(hostaddr, channel);
+      clients.put(hostaddr,
+          OzoneManagerServiceGrpc.newBlockingStub(channel));
+    }
+    int maxFailovers = conf.getInt(
+        OzoneConfigKeys.OZONE_CLIENT_FAILOVER_MAX_ATTEMPTS_KEY,
+        OzoneConfigKeys.OZONE_CLIENT_FAILOVER_MAX_ATTEMPTS_DEFAULT);
+
+    retryPolicy = omFailoverProxyProvider.getRetryPolicy(maxFailovers);
+    LOG.info("{}: started", CLIENT_NAME);
+  }
+
+  /**
+   * Sends the request to the OM whose address is currently stored in host
+   * and, on a gRPC failure, asks shouldRetry() whether to try again after
+   * a failover.
+   *
+   * @param payload request to submit
+   * @return the response of the first attempt that succeeds
+   * @throws IOException an OMException carrying TIMEOUT when the last
+   *                     attempt failed with UNAVAILABLE, INTERNAL_ERROR
+   *                     otherwise; the gRPC exception is attached as cause
+   */
+  @Override
+  public OMResponse submitRequest(OMRequest payload) throws IOException {
+    OMResponse resp = null;
+    boolean tryOtherHost = true;
+    while (tryOtherHost) {
+      tryOtherHost = false;
+      try {
+        resp = clients.get(host.get()).submitRequest(payload);
+      } catch (StatusRuntimeException e) {
+        // Derive the result code from this attempt only, so an earlier
+        // UNAVAILABLE does not relabel a later, different failure.
+        ResultCodes resultCode =
+            e.getStatus().getCode() == Status.Code.UNAVAILABLE
+                ? ResultCodes.TIMEOUT
+                : ResultCodes.INTERNAL_ERROR;
+        // unwrapException() reads the gRPC status from getCause().
+        Exception exp = new Exception(e);
+        tryOtherHost = shouldRetry(unwrapException(exp));
+        if (!tryOtherHost) {
+          // Keep the gRPC failure as the cause instead of dropping it.
+          throw new OMException(e.getMessage(), e, resultCode);
+        }
+      }
+    }
+    return resp;
+  }
+
+  /**
+   * Converts the wrapper created in submitRequest() into the exception
+   * that is handed to the retry policy.
+   *
+   * For status INTERNAL the status description is parsed as
+   * "className: remoteClassName:message": an instance of className is
+   * created with the full description as message and, when the remainder
+   * parses, a RemoteException is attached as its cause. For any other
+   * status the wrapper itself is returned.
+   *
+   * @param ex exception whose cause is expected to be a
+   *           StatusRuntimeException
+   * @return the reconstructed exception, ex itself, or an IOException
+   *         wrapping whatever went wrong while unwrapping
+   */
+  private Exception unwrapException(Exception ex) {
+    Exception grpcException = null;
+    try {
+      StatusRuntimeException srexp =
+          (StatusRuntimeException)ex.getCause();
+      Status status = srexp.getStatus();
+      String description = status.getDescription();
+      LOG.debug("GRPC exception wrapped: {}", description);
+      if (status.getCode() == Status.Code.INTERNAL) {
+        // exception potentially generated by OzoneManagerServiceGrpc
+        // NOTE(review): the class name is taken from text sent by the
+        // remote peer; asSubclass() limits it to Exception types, but
+        // consider an allow-list before loading it.
+        Class<?> realClass = Class.forName(
+            description.substring(0, description.indexOf(":")));
+        Class<? extends Exception> cls = realClass
+            .asSubclass(Exception.class);
+        Constructor<? extends Exception> cn = cls.getConstructor(String.class);
+        cn.setAccessible(true);
+        grpcException = cn.newInstance(description);
+        try {
+          // Skip "className: " and split the rest at its first colon.
+          String cause = description.substring(description.indexOf(":") + 2);
+          int separator = cause.indexOf(":");
+          IOException remote = new RemoteException(
+              cause.substring(0, separator),
+              cause.substring(separator + 1));
+          grpcException.initCause(remote);
+        } catch (Exception e) {
+          LOG.error("cannot get cause for remote exception", e);
+        }
+      } else {
+        // exception generated by connection failure, gRPC
+        grpcException = ex;
+      }
+    } catch (Exception e) {
+      grpcException = new IOException(e);
+      // The original format string had a "{}" with no argument; log the
+      // failure itself instead.
+      LOG.error("error unwrapping exception from OMResponse", e);
+    }
+    return grpcException;
+  }
+
+  private boolean shouldRetry(Exception ex) {
+    boolean retry = false;
+    RetryPolicy.RetryAction action = null;
+    try {
+      action = retryPolicy.shouldRetry((Exception)ex, 0, failoverCount++, 
true);
+      LOG.debug("grpc failover retry action {}", action.action);
+      if (action.action == RetryPolicy.RetryAction.RetryDecision.FAIL) {
+        retry = false;
+        LOG.error("Retry request failed. " + action.reason, ex);
+      } else {
+        if (action.action == RetryPolicy.RetryAction.RetryDecision.RETRY ||
+            (action.action == RetryPolicy.RetryAction.RetryDecision
+                .FAILOVER_AND_RETRY)) {
+          if (action.delayMillis > 0) {
+            try {
+              Thread.sleep(action.delayMillis);
+            } catch (Exception e) {
+              LOG.error("Error trying sleep thread for {}", 
action.delayMillis);
+            }
+          }
+          // switch om host to current proxy OMNodeId
+          omFailoverProxyProvider.performFailover(null);
+          host.set(omFailoverProxyProvider
+              .getGrpcProxyAddress(
+                  omFailoverProxyProvider.getCurrentProxyOMNodeId()));
+          retry = true;
+        }
+      }
+    } catch (Exception e) {
+      LOG.error("Failed failover exception {}", e);
+    }
+    return retry;
+  }
+
+  /**
+   * Stub implementation for the interface.
+   *
+   * @return a Text created with its no-argument constructor
+   */
+  @Override
+  public Text getDelegationTokenService() {
+    final Text service = new Text();
+    return service;
+  }
+
+  public void shutdown() {
+    for (Map.Entry<String, ManagedChannel> entry : channels.entrySet()) {
+      ManagedChannel channel = entry.getValue();
+      channel.shutdown();
+      try {
+        channel.awaitTermination(5, TimeUnit.SECONDS);
+      } catch (Exception e) {
+        LOG.error("failed to shutdown OzoneManagerServiceGrpc channel {} : {}",
+            entry.getKey(), e);
+      }
+    }
+  }
+
+  /**
+   * Closes the transport by delegating to shutdown().
+   *
+   * @throws IOException declared by the signature; this body itself only
+   *                     calls shutdown()
+   */
+  @Override
+  public void close() throws IOException {
+    this.shutdown();
+  }
+
+  /**
+   * GrpcOmTransport configuration in Java style configuration class.
+   */
+  @ConfigGroup(prefix = "ozone.om.grpc")
+  public static final class GrpcOmTransportConfig {
+    @Config(key = "port", defaultValue = "8981",

Review Comment:
   We can use a single "_**ozone.om.grpc.port**_" key; however, it is currently 
defined separately for the client (`GrpcOmTransportConfig`) and the server 
(_GrpcOzoneManagerServerConfig_).  Reusing the server's 
_GrpcOzoneManagerServerConfig_ in the client causes import problems, i.e. 
_**package org.apache.hadoop.ozone.om cannot be found**_, because that class is 
located in the OzoneManager path, which is unavailable to hadoop-ozone/common.
   
   Can the port be referenced by both `GrpcOmTransportConfig` and 
`GrpcOzoneManagerServerConfig` as it is now?  If not, and the two are combined, 
what would be a good package location for the common `Grpc..Config` class?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to