This is an automated email from the ASF dual-hosted git repository. alexey pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push: new e65ea38a4 [clock] output more info in SystemNtp::DumpDiagnostics() e65ea38a4 is described below commit e65ea38a4860c007d93ada9c991bccec903a80b1 Author: Alexey Serbin <ale...@apache.org> AuthorDate: Sun Feb 5 22:11:14 2023 -0800 [clock] output more info in SystemNtp::DumpDiagnostics() This patch adds more details to the output of the DumpDiagnostics() method for the system clock synchronized with NTP. In particular, the output now contains relevant metrics for the hybrid clock, the output of the 'ntpstat' utility (if present), the output of 'chronyc activity' (if the 'chronyc' CLI is present), and a snapshot of the newly introduced 'clock_ntp_status' metric. As already mentioned, this patch also adds a new 'clock_ntp_status' metric that provides information on ntp_gettime()/ntp_adjtime() call result. It's a string containing current timestamp (usec), max error (usec), and the result status of the call. The corresponding HybridClockTest.TestNtpDiagnostics scenario has been updated correspondingly. Change-Id: Ibb74e81ba1e4bc4c60acc6bd49bb458085e28207 Reviewed-on: http://gerrit.cloudera.org:8080/19478 Tested-by: Kudu Jenkins Reviewed-by: Ashwani Raina <ara...@cloudera.com> Reviewed-by: Yifan Zhang <chinazhangyi...@163.com> --- src/kudu/clock/hybrid_clock-test.cc | 9 +++++ src/kudu/clock/hybrid_clock.cc | 2 +- src/kudu/clock/system_ntp.cc | 75 ++++++++++++++++++++++++++++++++++++- src/kudu/clock/system_ntp.h | 17 ++++++++- 4 files changed, 99 insertions(+), 4 deletions(-) diff --git a/src/kudu/clock/hybrid_clock-test.cc b/src/kudu/clock/hybrid_clock-test.cc index 80588a723..4c9ec5ee2 100644 --- a/src/kudu/clock/hybrid_clock-test.cc +++ b/src/kudu/clock/hybrid_clock-test.cc @@ -577,10 +577,19 @@ TEST_F(HybridClockTest, TestNtpDiagnostics) { // error messages that corresponding binaries cannot be found. In either case // it proves the tools were attempted to run to collect NTP-related diagnostic // information. + ASSERT_STR_CONTAINS(s, "ntpstat"); ASSERT_STR_CONTAINS(s, "ntptime"); ASSERT_STR_CONTAINS(s, "ntpq"); ASSERT_STR_CONTAINS(s, "ntpdc"); ASSERT_STR_CONTAINS(s, "chronyc"); + + // Verify the output contains relevant metrics. + ASSERT_STR_CONTAINS(s, "clock_ntp_status"); + ASSERT_STR_CONTAINS(s, "hybrid_clock_extrapolating"); + ASSERT_STR_CONTAINS(s, "hybrid_clock_error"); + ASSERT_STR_CONTAINS(s, "hybrid_clock_timestamp"); + ASSERT_STR_CONTAINS(s, "hybrid_clock_max_errors"); + ASSERT_STR_CONTAINS(s, "hybrid_clock_extrapolation_intervals"); } #endif // #if defined(KUDU_HAS_SYSTEM_TIME_SOURCE) ... diff --git a/src/kudu/clock/hybrid_clock.cc b/src/kudu/clock/hybrid_clock.cc index d217c07ed..8c64c23be 100644 --- a/src/kudu/clock/hybrid_clock.cc +++ b/src/kudu/clock/hybrid_clock.cc @@ -542,7 +542,7 @@ Status HybridClock::InitWithTimeSource(TimeSource time_source) { break; #if defined(KUDU_HAS_SYSTEM_TIME_SOURCE) case TimeSource::NTP_SYNC_SYSTEM: - time_service_.reset(new clock::SystemNtp); + time_service_.reset(new clock::SystemNtp(metric_entity_)); break; #endif case TimeSource::UNSYNC_SYSTEM: diff --git a/src/kudu/clock/system_ntp.cc b/src/kudu/clock/system_ntp.cc index 78d7e9b42..4c2debbc7 100644 --- a/src/kudu/clock/system_ntp.cc +++ b/src/kudu/clock/system_ntp.cc @@ -21,6 +21,7 @@ #include <sys/timex.h> #include <cerrno> +#include <functional> #include <limits> #include <ostream> #include <string> @@ -33,7 +34,9 @@ #include "kudu/gutil/strings/join.h" #include "kudu/gutil/strings/substitute.h" #include "kudu/util/errno.h" +#include "kudu/util/jsonwriter.h" #include "kudu/util/logging.h" +#include "kudu/util/metrics.h" #include "kudu/util/path_util.h" #include "kudu/util/status.h" #include "kudu/util/subprocess.h" @@ -44,6 +47,11 @@ using std::string; using std::vector; using strings::Substitute; +METRIC_DEFINE_gauge_string(server, clock_ntp_status, "Clock NTP Status String", + kudu::MetricUnit::kState, + "Output of ntp_adjtime()/ntp_gettime() kernel API call", + kudu::MetricLevel::kDebug); + namespace kudu { namespace clock { @@ -114,8 +122,9 @@ void TryRun(vector<string> cmd, vector<string>* log) { } // anonymous namespace -SystemNtp::SystemNtp() - : skew_ppm_(std::numeric_limits<int64_t>::max()) { +SystemNtp::SystemNtp(const scoped_refptr<MetricEntity>& metric_entity) + : skew_ppm_(std::numeric_limits<int64_t>::max()), + metric_entity_(metric_entity) { } Status SystemNtp::Init() { @@ -131,6 +140,10 @@ Status SystemNtp::Init() { skew_ppm_ = t.tolerance >> 16; VLOG(1) << "ntp_adjtime(): tolerance is " << t.tolerance; + METRIC_clock_ntp_status.InstantiateFunctionGauge( + metric_entity_, []() { return ClockNtpStatusForMetrics(); })-> + AutoDetachToLastValue(&metric_detacher_); + return Status::OK(); } @@ -164,6 +177,8 @@ Status SystemNtp::WalltimeWithError(uint64_t* now_usec, uint64_t* error_usec) { void SystemNtp::DumpDiagnostics(vector<string>* log) const { LOG_STRING(ERROR, log) << "Dumping NTP diagnostics"; + + TryRun({"ntpstat"}, log); TryRun({"ntptime"}, log); // Gather as much info as possible from both ntpq and ntpdc, even // though some of it might be redundant. Different versions of ntp @@ -185,6 +200,62 @@ void SystemNtp::DumpDiagnostics(vector<string>* log) const { TryRun({"chronyc", "-n", "tracking"}, log); TryRun({"chronyc", "-n", "sources"}, log); + TryRun({"chronyc", "-n", "activity"}, log); + + MetricJsonOptions opts; + // There are relevant histogram metrics such as 'hybrid_clock_max_errors' and + // 'hybrid_clock_extrapolation_intervals' to output. + opts.include_raw_histograms = true; + // No need to output attributes -- that information could be retrieved from + // elsewhere, if needed. + opts.include_entity_attributes = false; + opts.filters.entity_metrics.emplace_back("clock_ntp_status"); + opts.filters.entity_metrics.emplace_back("hybrid_clock_extrapolating"); + opts.filters.entity_metrics.emplace_back("hybrid_clock_error"); + opts.filters.entity_metrics.emplace_back("hybrid_clock_timestamp"); + opts.filters.entity_metrics.emplace_back("hybrid_clock_max_errors"); + opts.filters.entity_metrics.emplace_back("hybrid_clock_extrapolation_intervals"); + + // Collect the metrics in JSON pretty-printed format. + std::ostringstream buf; + JsonWriter writer(&buf, JsonWriter::PRETTY); + if (auto s = metric_entity_->WriteAsJson(&writer, opts); !s.ok()) { + LOG_STRING(WARNING, log) << "failed to dump relevant clock metrics: " << s.ToString(); + } else { + LOG_STRING(ERROR, log) << buf.str(); + } +} + +string SystemNtp::ClockNtpStatusForMetrics() { + static constexpr const char* const kFmt = "now:$0 maxerror:$1 status:$2"; + +#ifdef __APPLE__ + ntptimeval t; + if (const auto s = NtpStateToStatus(ntp_gettime(&t)); !s.ok()) { + return Substitute(kFmt, "n/a", "n/a", s.ToString()); + } + const uint64_t now_usec = static_cast<uint64_t>(t.time.tv_sec) * 1000000 + + t.time.tv_nsec / 1000; +#else + timex t; + t.modes = 0; // set mode to 0 for read-only query + const int rc = ntp_adjtime(&t); + + if (const auto s = CheckForNtpAdjtimeError(rc); !s.ok()) { + return Substitute(kFmt, "n/a", "n/a", s.ToString()); + } + if (const auto s = NtpStateToStatus(rc); !s.ok()) { + return Substitute(kFmt, "n/a", "n/a", s.ToString()); + } + + if (t.status & STA_NANO) { + t.time.tv_usec /= 1000; + } + const uint64_t now_usec = static_cast<uint64_t>(t.time.tv_sec) * 1000000 + + t.time.tv_usec; +#endif // #ifdef __APPLE__ ... #else ... + + return Substitute(kFmt, now_usec, t.maxerror, "ok"); } } // namespace clock diff --git a/src/kudu/clock/system_ntp.h b/src/kudu/clock/system_ntp.h index 90ec93ac5..b79314d12 100644 --- a/src/kudu/clock/system_ntp.h +++ b/src/kudu/clock/system_ntp.h @@ -22,6 +22,8 @@ #include "kudu/clock/time_service.h" #include "kudu/gutil/macros.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/metrics.h" #include "kudu/util/status.h" namespace kudu { @@ -35,7 +37,7 @@ namespace clock { // to keep the kernel's timekeeping up to date and in sync. class SystemNtp : public TimeService { public: - SystemNtp(); + explicit SystemNtp(const scoped_refptr<MetricEntity>& metric_entity); Status Init() override; @@ -48,10 +50,23 @@ class SystemNtp : public TimeService { void DumpDiagnostics(std::vector<std::string>* log) const override; private: + // Returns current timestamp, maxerror, and status as returned by the + // invocation of ntp_gettime()/ntp_adjtime() NTP kernel API call. + static std::string ClockNtpStatusForMetrics(); + // The maximum possible clock frequency skew rate reported by the kernel, // parts-per-million (PPM). int64_t skew_ppm_; + // Metric entity. Used to fetch information on NTP-related metrics upon + // calling DumpDiagnostics(). + scoped_refptr<MetricEntity> metric_entity_; + + // Metrics are set to detach to their last value. This means that, during + // running the destructor, there might be a need to access other class members + // declared above. Hence, this member must be declared last. + FunctionGaugeDetacher metric_detacher_; + DISALLOW_COPY_AND_ASSIGN(SystemNtp); };