This is an automated email from the ASF dual-hosted git repository. rombert pushed a commit to annotated tag org.apache.sling.discovery.oak-1.2.0 in repository https://gitbox.apache.org/repos/asf/sling-org-apache-sling-discovery-oak.git
commit 5808792743806cd9fa182b84727da2ac4dcaeb7d Author: Stefan Egli <[email protected]> AuthorDate: Tue Nov 24 15:46:37 2015 +0000 SLING-5326 : adding time-difference health-check 'ClocksInSyncHealthCheck' git-svn-id: https://svn.apache.org/repos/asf/sling/trunk/bundles/extensions/discovery/oak@1716181 13f79535-47bb-0310-9956-ffa450edef68 --- pom.xml | 8 +- .../discovery/oak/ClocksInSyncHealthCheck.java | 186 +++++++++++++++++++++ 2 files changed, 193 insertions(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 8fcb059..085c5be 100644 --- a/pom.xml +++ b/pom.xml @@ -29,7 +29,7 @@ <artifactId>org.apache.sling.discovery.oak</artifactId> <packaging>bundle</packaging> - <version>1.1.1-SNAPSHOT</version> + <version>1.2.0-SNAPSHOT</version> <name>Apache Sling Oak-Based Discovery Service</name> <description>Implementation of Apache Sling Discovery based on Jackrabbit Oak using its discovery-lite descriptor for in-cluster view detection and a TopologyView through HTTP POST heartbeats announcing sub-topologies to each other.</description> @@ -319,5 +319,11 @@ <version>1.3.7</version> <scope>test</scope> </dependency> + <dependency> + <groupId>org.apache.sling</groupId> + <artifactId>org.apache.sling.hc.core</artifactId> + <version>1.0.6</version> + <type>bundle</type> + </dependency> </dependencies> </project> diff --git a/src/main/java/org/apache/sling/discovery/oak/ClocksInSyncHealthCheck.java b/src/main/java/org/apache/sling/discovery/oak/ClocksInSyncHealthCheck.java new file mode 100644 index 0000000..900428f --- /dev/null +++ b/src/main/java/org/apache/sling/discovery/oak/ClocksInSyncHealthCheck.java @@ -0,0 +1,186 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.sling.discovery.oak; + +import java.lang.management.ManagementFactory; +import java.util.Collection; +import java.util.Set; + +import javax.management.MBeanServer; +import javax.management.ObjectName; + +import org.apache.felix.scr.annotations.Component; +import org.apache.felix.scr.annotations.Properties; +import org.apache.felix.scr.annotations.Property; +import org.apache.felix.scr.annotations.PropertyUnbounded; +import org.apache.felix.scr.annotations.Reference; +import org.apache.felix.scr.annotations.Service; +import org.apache.sling.discovery.base.connectors.announcement.Announcement; +import org.apache.sling.discovery.base.connectors.announcement.AnnouncementRegistry; +import org.apache.sling.discovery.base.connectors.announcement.CachedAnnouncement; +import org.apache.sling.hc.api.HealthCheck; +import org.apache.sling.hc.api.Result; +import org.apache.sling.hc.util.FormattingResultLog; +import org.apache.sling.settings.SlingSettingsService; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * HealthCheck that builds on-top of DocumentNodeStore's + * determineServerTimeDifferenceMillis method which checks how much the local + * time differs from the DocumentStore's time. 
It then applies low- and
+ * high-water marks to that time difference:
+ * <ul>
+ * <li>if the absolute value is equal to or higher than the high-water mark
+ * (5sec), then it issues a critical</li>
+ * <li>if the absolute value is lower than the high-water but equal to or
+ * higher than the low-water mark (1sec), then it issues only a warn</li>
+ * <li>if the absolute value is lower than the low-water mark, then it issues
+ * only an info</li>
+ * </ul>
+ * <p>
+ * The same scheme is additionally applied to every announcement returned by
+ * {@link AnnouncementRegistry#listLocalAnnouncements()}: the absolute
+ * difference between the announcement's 'originallyCreatedAt' and 'receivedAt'
+ * timestamps is compared against a low-water mark of 5sec and a high-water
+ * mark of 10sec.
+ */
+@Component(immediate = true, metatype = true, label = "Apache Sling Discovery Oak Clocks-In-Sync Health Check")
+@Properties({
+    @Property(name = HealthCheck.NAME, value = "SlingDiscoveryOakClocksInSyncHC", description = "Health Check name", label = "Name"),
+    @Property(name = HealthCheck.TAGS, unbounded = PropertyUnbounded.ARRAY, description = "Health Check tags", label = "Tags"),
+    @Property(name = HealthCheck.MBEAN_NAME, value = "slingDiscoveryOakClocksInSync", description = "Health Check MBean name", label = "MBean name") })
+@Service(value = HealthCheck.class)
+public class ClocksInSyncHealthCheck implements HealthCheck {
+
+    protected final Logger logger = LoggerFactory.getLogger(getClass());
+
+    /** JMX object-name pattern used to look up the DocumentNodeStore MBean. */
+    private static final String DOCUMENT_NODE_STORE_MBEAN = "org.apache.jackrabbit.oak:name=*,type=\"DocumentNodeStore\",id=*";
+
+    /** Name of the (argument-less) MBean operation invoked to obtain the time difference. */
+    private static final String TIME_DIFF_METHOD_NAME = "determineServerTimeDifferenceMillis";
+
+    /** Intra-cluster difference (ms) at or above which a critical is issued. */
+    private static final long INTRA_CLUSTER_HIGH_WATER_MARK = 5000;
+
+    /** Intra-cluster difference (ms) at or above which a warn is issued. */
+    private static final long INTRA_CLUSTER_LOW_WATER_MARK = 1000;
+
+    /** Inter-cluster (topology connector) difference (ms) at or above which a critical is issued. */
+    private static final long INTER_CLUSTER_HIGH_WATER_MARK = 10000;
+
+    /** Inter-cluster (topology connector) difference (ms) at or above which a warn is issued. */
+    private static final long INTER_CLUSTER_LOW_WATER_MARK = 5000;
+
+    @Reference
+    private AnnouncementRegistry announcementRegistry;
+
+    @Reference
+    private SlingSettingsService settingsService;
+
+    /**
+     * Runs the intra-cluster check (value returned by the DocumentNodeStore
+     * MBean) followed by the inter-cluster check (one comparison per local
+     * announcement) and wraps everything that was logged into the returned
+     * {@link Result}.
+     *
+     * @return the result built from the log entries collected by both checks
+     */
+    @Override
+    public Result execute() {
+        final FormattingResultLog resultLog = new FormattingResultLog();
+        resultLog.debug("Checking cluster internal clocks");
+        try {
+            final MBeanServer jmxServer = ManagementFactory.getPlatformMBeanServer();
+            ObjectName n = new ObjectName(DOCUMENT_NODE_STORE_MBEAN);
+            Set<ObjectName> names = jmxServer.queryNames(n, null);
+
+            if (names.isEmpty()) {
+                resultLog.info("Intra-cluster test n/a (No DocumentNodeStore MBean found)");
+            } else {
+                // only the first MBean matching the pattern is consulted
+                ObjectName firstName = names.iterator().next();
+                final Object value = jmxServer.invoke(firstName, TIME_DIFF_METHOD_NAME, new Object[0], new String[0]);
+                // three placeholders for three arguments, otherwise the returned value is dropped from the message
+                logger.debug("{}.{} returns {}", new Object[] { firstName, TIME_DIFF_METHOD_NAME, value });
+                resultLog.debug("{}.{} returns {}", firstName, TIME_DIFF_METHOD_NAME, value);
+                if (value instanceof Long) {
+                    final long diffMillis = (Long) value;
+                    // the sign is irrelevant for the thresholds, the messages keep the signed value
+                    final long absDiffMillis = Math.abs(diffMillis);
+                    if (absDiffMillis >= INTRA_CLUSTER_HIGH_WATER_MARK) {
+                        logger.warn(
+                                "execute: clocks in local cluster out of sync by {}ms "
+                                        + "which is equal or higher than the high-water mark of {}ms.",
+                                diffMillis, INTRA_CLUSTER_HIGH_WATER_MARK);
+                        resultLog.critical(
+                                "Clocks heavily out of sync in local cluster: "
+                                        + "time difference of this VM with DocumentStore server: "
+                                        + "{}ms is equal or larger than high-water mark of {}ms",
+                                diffMillis, INTRA_CLUSTER_HIGH_WATER_MARK);
+                    } else if (absDiffMillis >= INTRA_CLUSTER_LOW_WATER_MARK) {
+                        logger.warn(
+                                "execute: clocks in local cluster out of sync by {}ms "
+                                        + "which is equal or higher than the low-water mark of {}ms.",
+                                diffMillis, INTRA_CLUSTER_LOW_WATER_MARK);
+                        resultLog.warn(
+                                "Clocks noticeably out of sync in local cluster: "
+                                        + "time difference of this VM with DocumentStore server: "
+                                        + "{}ms is equal or larger than low-water mark of {}ms",
+                                diffMillis, INTRA_CLUSTER_LOW_WATER_MARK);
+                    } else {
+                        logger.debug("execute: clocks in local cluster in sync. diff is {}ms "
+                                + "which is within low-water mark of {}ms.", diffMillis, INTRA_CLUSTER_LOW_WATER_MARK);
+                        resultLog.info("Clocks in sync in local cluster: time difference of this VM with DocumentStore server: "
+                                + "{}ms is within low-water mark of {}ms", diffMillis, INTRA_CLUSTER_LOW_WATER_MARK);
+                    }
+                } else {
+                    // do not skip silently: without this the result would carry no intra-cluster entry at all
+                    logger.warn("execute: {}.{} returned unexpected value: {}",
+                            new Object[] { firstName, TIME_DIFF_METHOD_NAME, value });
+                    resultLog.warn("Intra-cluster test n/a ({}.{} returned unexpected value: {})",
+                            firstName, TIME_DIFF_METHOD_NAME, value);
+                }
+            }
+        } catch (final Exception e) {
+            // the trailing 'e' has no placeholder of its own: SLF4J (1.6+) then
+            // logs it as the exception, i.e. including its stack trace
+            logger.warn("execute: {}, JMX method {} invocation failed: {}",
+                    new Object[] { DOCUMENT_NODE_STORE_MBEAN, TIME_DIFF_METHOD_NAME, e.toString(), e });
+            resultLog.healthCheckError("{}, JMX method {} invocation failed: {}", DOCUMENT_NODE_STORE_MBEAN, TIME_DIFF_METHOD_NAME,
+                    e);
+        }
+
+        final String slingId = settingsService == null ? "n/a" : settingsService.getSlingId();
+
+        if (announcementRegistry == null) {
+            logger.warn("execute: no announcementRegistry ({}) set", announcementRegistry);
+            resultLog.warn("Cannot determine topology clocks since no announcementRegistry ({}) set", announcementRegistry);
+        } else {
+            final Collection<Announcement> localAnnouncements = announcementRegistry.listLocalAnnouncements();
+            if (localAnnouncements.isEmpty()) {
+                logger.info("execute: no topology connectors connected to local instance.");
+                resultLog.info("No topology connectors connected to local instance.");
+            }
+            for (Announcement ann : localAnnouncements) {
+                final String peerSlingId = ann.isInherited() ? ann.getServerInfo() : ann.getOwnerId();
+                final long originallyCreatedAt = ann.getOriginallyCreatedAt();
+                final long receivedAt = ann.getReceivedAt();
+                // NOTE(review): presumably 'originallyCreatedAt' is the peer's clock and 'receivedAt'
+                // the local one, so the difference also contains the transport delay - confirm
+                final long diffMillis = Math.abs(originallyCreatedAt - receivedAt);
+                if (diffMillis >= INTER_CLUSTER_HIGH_WATER_MARK) {
+                    logger.warn(
+                            "execute: clocks between local instance (slingId: {}) and remote instance (slingId: {}) out of sync by {}ms "
+                                    + "which is equal or higher than the high-water mark of {}ms.",
+                            new Object[] { slingId, peerSlingId, diffMillis, INTER_CLUSTER_HIGH_WATER_MARK });
+                    resultLog.critical(
+                            "Clocks heavily out of sync between local instance (slingId: {}) and remote instance (slingId: {}): "
+                                    + "by {}ms which is equal or larger than high-water mark of {}ms",
+                            new Object[] { slingId, peerSlingId, diffMillis, INTER_CLUSTER_HIGH_WATER_MARK });
+                } else if (diffMillis >= INTER_CLUSTER_LOW_WATER_MARK) {
+                    // report the low-water mark here, it is the threshold that was actually crossed
+                    logger.warn(
+                            "execute: clocks out of sync between local instance (slingId: {}) and remote instance (slingId: {}) by {}ms "
+                                    + "which is equal or higher than the low-water mark of {}ms.",
+                            new Object[] { slingId, peerSlingId, diffMillis, INTER_CLUSTER_LOW_WATER_MARK });
+                    resultLog.warn(
+                            "Clocks noticeably out of sync between local instance (slingId: {}) and remote instance (slingId: {}): "
+                                    + "by {}ms which is equal or larger than low-water mark of {}ms",
+                            new Object[] { slingId, peerSlingId, diffMillis, INTER_CLUSTER_LOW_WATER_MARK });
+                } else {
+                    logger.debug(
+                            "execute: clocks in sync between local instance (slingId: {}) and remote instance (slingId: {}). "
+                                    + "diff is {}ms which is within low-water mark of {}ms.",
+                            new Object[] { slingId, peerSlingId, diffMillis, INTER_CLUSTER_LOW_WATER_MARK });
+                    resultLog.info(
+                            "Clocks in sync between local instance (slingId: {}) and remote instance (slingId: {}): "
+                                    + "diff is {}ms which is within low-water mark of {}ms",
+                            new Object[] { slingId, peerSlingId, diffMillis, INTER_CLUSTER_LOW_WATER_MARK });
+                }
+            }
+        }
+
+        return new Result(resultLog);
+    }
+
+}
-- To stop receiving notification emails like this one, please contact "[email protected]" <[email protected]>.
