sergey-chugunov-1985 commented on code in PR #12729:
URL: https://github.com/apache/ignite/pull/12729#discussion_r3239803330
##########
modules/core/src/main/java/org/apache/ignite/spi/discovery/tcp/ServerImpl.java:
##########
@@ -8232,6 +8479,185 @@ boolean markLastFailedNodeAlive() {
@Override public String toString() {
return S.toString(CrossRingMessageSendState.class, this);
}
+
+ /** */
+ void pingRemoteDCs(List<TcpDiscoveryNode> nodesToPing) {
+ assert !remoteDcPingStarted();
+ assert !F.isEmpty(nodesToPing);
+
+ rmtDcPingMaxTimeNs = System.nanoTime() + (long)((failTimeNanos -
System.nanoTime()) * RMT_DC_PING_TIMEOUT_RATIO);
Review Comment:
```suggestion
rmtDcPingMaxTimeNs = System.nanoTime() +
(U.millisToNanos(spi.getEffectiveConnectionRecoveryTimeout()) *
RMT_DC_PING_TIMEOUT_RATIO;
```
##########
modules/core/src/main/java/org/apache/ignite/spi/discovery/tcp/ServerImpl.java:
##########
@@ -830,12 +865,41 @@ private boolean pingNode(TcpDiscoveryNode node) {
long timeout,
boolean logError
) throws IgniteCheckedException {
+ return pingNode(addr, nodeId, clientNodeId, timeout,
spi.getReconnectCount(), 0, logError);
+ }
+
+ /**
+ * Pings the node by its address to see if it's alive.
+ *
+ * @param addr Address of the node.
+ * @param nodeId Node ID to ping. In case when client node ID is not null
this node ID is an ID of the router node.
+ * @param clientNodeId Client node ID.
+ * @param timeout Timeout on operation in milliseconds. If 0, a value
based on {@link TcpDiscoverySpi} is used.
+ * @param reconNum Reconnects number.
+ * @param reconDelayRatio Delay ration compared to {@code timeout} /
{@code reconNum}.
+ * @param logError Boolean flag indicating whether information should be
printed into the node log.
+ * @return ID of the remote node and "client exists" flag if node alive or
{@code null} if the remote node has
+ * left a topology during the ping process.
+ * @throws IgniteCheckedException If an error occurs.
+ */
+ @Nullable private IgniteBiTuple<UUID, Boolean> pingNode(
+ InetSocketAddress addr,
+ @Nullable UUID nodeId,
+ @Nullable UUID clientNodeId,
+ long timeout,
+ int reconNum,
Review Comment:
```suggestion
int reconnectAttempts,
```
##########
modules/core/src/main/java/org/apache/ignite/spi/discovery/tcp/ServerImpl.java:
##########
@@ -830,12 +865,41 @@ private boolean pingNode(TcpDiscoveryNode node) {
long timeout,
boolean logError
) throws IgniteCheckedException {
+ return pingNode(addr, nodeId, clientNodeId, timeout,
spi.getReconnectCount(), 0, logError);
+ }
+
+ /**
+ * Pings the node by its address to see if it's alive.
+ *
+ * @param addr Address of the node.
+ * @param nodeId Node ID to ping. In case when client node ID is not null
this node ID is an ID of the router node.
+ * @param clientNodeId Client node ID.
+ * @param timeout Timeout on operation in milliseconds. If 0, a value
based on {@link TcpDiscoverySpi} is used.
+ * @param reconNum Reconnects number.
+ * @param reconDelayRatio Delay ration compared to {@code timeout} /
{@code reconNum}.
Review Comment:
Please rewrite this comment to better explain the meaning of this parameter.
##########
modules/core/src/main/java/org/apache/ignite/spi/discovery/tcp/ServerImpl.java:
##########
@@ -830,12 +865,41 @@ private boolean pingNode(TcpDiscoveryNode node) {
long timeout,
boolean logError
) throws IgniteCheckedException {
+ return pingNode(addr, nodeId, clientNodeId, timeout,
spi.getReconnectCount(), 0, logError);
+ }
+
+ /**
+ * Pings the node by its address to see if it's alive.
+ *
+ * @param addr Address of the node.
+ * @param nodeId Node ID to ping. In case when client node ID is not null
this node ID is an ID of the router node.
+ * @param clientNodeId Client node ID.
+ * @param timeout Timeout on operation in milliseconds. If 0, a value
based on {@link TcpDiscoverySpi} is used.
+ * @param reconNum Reconnects number.
+ * @param reconDelayRatio Delay ration compared to {@code timeout} /
{@code reconNum}.
+ * @param logError Boolean flag indicating whether information should be
printed into the node log.
+ * @return ID of the remote node and "client exists" flag if node alive or
{@code null} if the remote node has
+ * left a topology during the ping process.
+ * @throws IgniteCheckedException If an error occurs.
+ */
+ @Nullable private IgniteBiTuple<UUID, Boolean> pingNode(
+ InetSocketAddress addr,
+ @Nullable UUID nodeId,
+ @Nullable UUID clientNodeId,
+ long timeout,
+ int reconNum,
+ float reconDelayRatio,
+ boolean logError
+ ) throws IgniteCheckedException {
+ long timeThreshold = timeout > 0 ? System.nanoTime() +
U.millisToNanos(timeout) : 0;
+
+ assert reconNum > 0;
assert addr != null;
assert timeout >= 0;
+ assert reconDelayRatio >= 0.0f;
- IgniteSpiOperationTimeoutHelper timeoutHelper = timeout == 0
- ? new IgniteSpiOperationTimeoutHelper(spi, clientNodeId == null)
- : new IgniteSpiOperationTimeoutHelper(timeout);
+ long attemptTimeout = (long)(timeout * (1.0f - reconDelayRatio)) /
reconNum;
+ long attemptDelayTiemout = reconNum > 1 ? (long)(timeout *
reconDelayRatio) / (reconNum - 1) : 0;
Review Comment:
```suggestion
long attemptDelayTimeout = reconNum > 1 ? (long)(timeout *
reconDelayRatio) / (reconNum - 1) : 0;
```
##########
modules/core/src/main/java/org/apache/ignite/spi/discovery/tcp/ServerImpl.java:
##########
@@ -932,38 +1000,33 @@ private boolean pingNode(TcpDiscoveryNode node) {
reconCnt++;
- if (!openedSock && reconCnt == 2) {
- logPingError(errMsgPrefix + "Was unable to open
the socket at all. " +
- "Cause: " + e.getMessage(), logError);
-
- break;
- }
-
- if
(IgniteSpiOperationTimeoutHelper.checkFailureTimeoutReached(e)
- && (spi.failureDetectionTimeoutEnabled() ||
timeout != 0)) {
+ if ((timeThreshold > 0 && System.nanoTime() >=
timeThreshold) || (spi.failureDetectionTimeoutEnabled()
+ &&
IgniteSpiOperationTimeoutHelper.checkFailureTimeoutReached(e))) {
logPingError(errMsgPrefix + "Reached the timeout "
+
(timeout == 0 ? spi.failureDetectionTimeout()
: timeout) +
"ms. Cause: " + e.getMessage(), logError);
break;
}
- else if (!spi.failureDetectionTimeoutEnabled() &&
reconCnt == spi.getReconnectCount()) {
- logPingError(errMsgPrefix + "Reached the
reconnection count spi.getReconnectCount(). " +
- "Cause: " + e.getMessage(), logError);
+ else if (reconCnt >= reconNum) {
+ logPingError(errMsgPrefix + "Reached the
reconnection count " + reconNum + ". Cause: "
Review Comment:
```suggestion
logPingError(errMsgPrefix + "Max reconnect
attempts have been reached: " + reconNum + ". Cause: "
```
##########
modules/core/src/main/java/org/apache/ignite/spi/discovery/tcp/ServerImpl.java:
##########
@@ -830,12 +865,41 @@ private boolean pingNode(TcpDiscoveryNode node) {
long timeout,
boolean logError
) throws IgniteCheckedException {
+ return pingNode(addr, nodeId, clientNodeId, timeout,
spi.getReconnectCount(), 0, logError);
+ }
+
+ /**
+ * Pings the node by its address to see if it's alive.
+ *
+ * @param addr Address of the node.
+ * @param nodeId Node ID to ping. In case when client node ID is not null
this node ID is an ID of the router node.
+ * @param clientNodeId Client node ID.
+ * @param timeout Timeout on operation in milliseconds. If 0, a value
based on {@link TcpDiscoverySpi} is used.
+ * @param reconNum Reconnects number.
+ * @param reconDelayRatio Delay ration compared to {@code timeout} /
{@code reconNum}.
+ * @param logError Boolean flag indicating whether information should be
printed into the node log.
+ * @return ID of the remote node and "client exists" flag if node alive or
{@code null} if the remote node has
+ * left a topology during the ping process.
+ * @throws IgniteCheckedException If an error occurs.
+ */
+ @Nullable private IgniteBiTuple<UUID, Boolean> pingNode(
+ InetSocketAddress addr,
+ @Nullable UUID nodeId,
+ @Nullable UUID clientNodeId,
+ long timeout,
+ int reconNum,
+ float reconDelayRatio,
+ boolean logError
+ ) throws IgniteCheckedException {
+ long timeThreshold = timeout > 0 ? System.nanoTime() +
U.millisToNanos(timeout) : 0;
Review Comment:
```suggestion
long timeoutThreshold = timeout > 0 ? System.nanoTime() +
U.millisToNanos(timeout) : 0;
```
##########
modules/core/src/main/java/org/apache/ignite/spi/discovery/tcp/ServerImpl.java:
##########
@@ -3444,11 +3512,13 @@ else if (log.isTraceEnabled())
// Handshake.
TcpDiscoveryHandshakeRequest hndMsg = new
TcpDiscoveryHandshakeRequest(locNodeId);
- // Topology treated as changes if next node is
not available.
- boolean changeTop = sndState != null &&
!sndState.isStartingPoint();
-
- if (changeTop)
-
hndMsg.previousNodeId(ring.previousNodeOf(next).id());
+ // If want a forced connection, we set the
change-topology node flag to current node id.
Review Comment:
Please add more details about forced connection to this comment
##########
modules/core/src/main/java/org/apache/ignite/spi/discovery/tcp/ServerImpl.java:
##########
@@ -3802,27 +3864,39 @@ else if (e instanceof SocketTimeoutException ||
if (!sent) {
if (sndState == null &&
spi.getEffectiveConnectionRecoveryTimeout() > 0)
- sndState = new CrossRingMessageSendState();
- else if (sndState != null && sndState.checkTimeout()) {
- segmentLocalNodeOnSendFail(failedNodes);
-
+ sndState = createConnectionRecoveryState(newNext);
+ else if (sndState != null &&
checkConnectionRecoveryFailed(sndState, failedNodes))
return; // Nothing to do here.
- }
- boolean failedNextNode = sndState == null ||
sndState.markNextNodeFailed();
+ // The next node might be reset as result of the parallel
remote DC ping process.
Review Comment:
```suggestion
// The next node might be reset as a result of the
parallel remote DC ping process.
```
##########
modules/core/src/main/java/org/apache/ignite/spi/discovery/tcp/ServerImpl.java:
##########
@@ -3802,27 +3864,39 @@ else if (e instanceof SocketTimeoutException ||
if (!sent) {
if (sndState == null &&
spi.getEffectiveConnectionRecoveryTimeout() > 0)
- sndState = new CrossRingMessageSendState();
- else if (sndState != null && sndState.checkTimeout()) {
- segmentLocalNodeOnSendFail(failedNodes);
-
+ sndState = createConnectionRecoveryState(newNext);
+ else if (sndState != null &&
checkConnectionRecoveryFailed(sndState, failedNodes))
return; // Nothing to do here.
- }
- boolean failedNextNode = sndState == null ||
sndState.markNextNodeFailed();
+ // The next node might be reset as result of the parallel
remote DC ping process.
+ boolean failedNextNode = next != null && (sndState == null
|| sndState.markNextNodeFailed());
if (failedNextNode && !failedNodes.contains(next)) {
failedNodes.add(next);
+ TcpDiscoveryNode next0 = next;
+
+ if (allRemoteDCsTraversed(sndState, failedNodes,
next)) {
+ if (log.isInfoEnabled()) {
+ log.info("During the connection recovery, all
the remote DCs have been traversed. " +
Review Comment:
Please use this phrase instead: "Connection recovery finished: all remote
DCs traversed, none available. Current node will skip failed DCs. Ring
connection recovery time remaining:"
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]