This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push:
new e755d64e62f [feature](be jvm monitor)append enable_jvm_monitor in
be.conf to control jvm monitor. (#35608) (#35764)
e755d64e62f is described below
commit e755d64e62fa6c315c4a833a7dff62a59e2bce66
Author: Mingyu Chen <[email protected]>
AuthorDate: Sun Jun 2 00:18:44 2024 +0800
[feature](be jvm monitor)append enable_jvm_monitor in be.conf to control
jvm monitor. (#35608) (#35764)
bp #35608
Co-authored-by: daidai <[email protected]>
---
be/src/common/config.cpp | 3 +
be/src/common/config.h | 3 +
be/src/util/jvm_metrics.cpp | 112 ++++++++++++++++++++-----
be/src/util/jvm_metrics.h | 16 ++--
regression-test/pipeline/external/conf/be.conf | 3 +
regression-test/pipeline/p0/conf/be.conf | 3 +
regression-test/pipeline/p1/conf/be.conf | 5 +-
7 files changed, 116 insertions(+), 29 deletions(-)
diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp
index 8d8cb8f222e..6d0c866cebe 100644
--- a/be/src/common/config.cpp
+++ b/be/src/common/config.cpp
@@ -1222,6 +1222,9 @@ DEFINE_mInt32(thrift_client_open_num_tries, "1");
DEFINE_mBool(ignore_schema_change_check, "false");
+//JVM monitoring enable. To prevent be from crashing due to jvm compatibility issues. The default setting is off.
+DEFINE_Bool(enable_jvm_monitor, "false");
+
// clang-format off
#ifdef BE_TEST
// test s3
diff --git a/be/src/common/config.h b/be/src/common/config.h
index 0334cb085c6..b172a3406d6 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -1303,6 +1303,9 @@ DECLARE_mInt32(thrift_client_open_num_tries);
DECLARE_mBool(ignore_schema_change_check);
+//JVM monitoring enable. To prevent be from crashing due to jvm compatibility issues.
+DECLARE_Bool(enable_jvm_monitor);
+
#ifdef BE_TEST
// test s3
DECLARE_String(test_s3_resource);
diff --git a/be/src/util/jvm_metrics.cpp b/be/src/util/jvm_metrics.cpp
index e55cf8f3fbe..fc30d1073ac 100644
--- a/be/src/util/jvm_metrics.cpp
+++ b/be/src/util/jvm_metrics.cpp
@@ -17,10 +17,12 @@
#include "jvm_metrics.h"
+#include <util/jni-util.h>
+
#include <functional>
+#include "common/config.h"
#include "util/metrics.h"
-
namespace doris {
#define DEFINE_JVM_SIZE_BYTES_METRIC(name, type) \
@@ -76,15 +78,28 @@ DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(jvm_gc_g1_old_generation_time_ms, MetricUni
const char* JvmMetrics::_s_hook_name = "jvm_metrics";
-JvmMetrics::JvmMetrics(MetricRegistry* registry, JNIEnv* env) : _jvm_stats(env) {
+JvmMetrics::JvmMetrics(MetricRegistry* registry, JNIEnv* env) {
DCHECK(registry != nullptr);
_registry = registry;
_server_entity = _registry->register_entity("server");
DCHECK(_server_entity != nullptr);
- if (_jvm_stats.init_complete()) {
+
+ do {
+ if (!doris::config::enable_jvm_monitor) {
+ break;
+ }
+ try {
+ _jvm_stats.init(env);
+ } catch (...) {
+ LOG(WARNING) << "JVM STATS INIT FAIL";
+ break;
+ }
+ if (!_jvm_stats.init_complete()) {
+ break;
+ }
_server_entity->register_hook(_s_hook_name,
std::bind(&JvmMetrics::update, this));
- }
+ } while (false);
INT_GAUGE_METRIC_REGISTER(_server_entity, jvm_heap_size_bytes_max);
INT_GAUGE_METRIC_REGISTER(_server_entity, jvm_heap_size_bytes_committed);
@@ -117,11 +132,58 @@ JvmMetrics::JvmMetrics(MetricRegistry* registry, JNIEnv* env) : _jvm_stats(env)
}
void JvmMetrics::update() {
- _jvm_stats.refresh(this);
+ static long fail_count = 0;
+ bool have_exception = false;
+ try {
+ _jvm_stats.refresh(this);
+ } catch (...) {
+ have_exception = true;
+ LOG(WARNING) << "JVM MONITOR UPDATE FAIL!";
+ fail_count++;
+ }
+
+ //When 30 consecutive exceptions occur, turn off jvm information collection.
+ if (!have_exception) {
+ fail_count = 0;
+ }
+ if (fail_count >= 30) {
+ LOG(WARNING) << "JVM MONITOR CLOSE!";
+ _jvm_stats.set_complete(false);
+ _server_entity->deregister_hook(_s_hook_name);
+
+ jvm_heap_size_bytes_max->set_value(0);
+ jvm_heap_size_bytes_committed->set_value(0);
+ jvm_heap_size_bytes_used->set_value(0);
+
+ jvm_non_heap_size_bytes_used->set_value(0);
+ jvm_non_heap_size_bytes_committed->set_value(0);
+
+ jvm_young_size_bytes_used->set_value(0);
+ jvm_young_size_bytes_peak_used->set_value(0);
+ jvm_young_size_bytes_max->set_value(0);
+
+ jvm_old_size_bytes_used->set_value(0);
+ jvm_old_size_bytes_peak_used->set_value(0);
+ jvm_old_size_bytes_max->set_value(0);
+
+ jvm_thread_count->set_value(0);
+ jvm_thread_peak_count->set_value(0);
+ jvm_thread_new_count->set_value(0);
+ jvm_thread_runnable_count->set_value(0);
+ jvm_thread_blocked_count->set_value(0);
+ jvm_thread_waiting_count->set_value(0);
+ jvm_thread_timed_waiting_count->set_value(0);
+ jvm_thread_terminated_count->set_value(0);
+
+ jvm_gc_g1_young_generation_count->set_value(0);
+ jvm_gc_g1_young_generation_time_ms->set_value(0);
+ jvm_gc_g1_old_generation_count->set_value(0);
+ jvm_gc_g1_old_generation_time_ms->set_value(0);
+ }
}
-#include <util/jni-util.h>
-jvmStats::jvmStats(JNIEnv* ENV) : env(ENV) {
+void JvmStats::init(JNIEnv* ENV) {
+ env = ENV;
_managementFactoryClass =
env->FindClass("java/lang/management/ManagementFactory");
if (_managementFactoryClass == nullptr) {
LOG(WARNING)
@@ -244,15 +306,19 @@ jvmStats::jvmStats(JNIEnv* ENV) : env(ENV) {
LOG(INFO) << "Start JVM monitoring.";
_init_complete = true;
+ return;
}
-#include "jni.h"
-
-void jvmStats::refresh(JvmMetrics* jvm_metrics) {
+void JvmStats::refresh(JvmMetrics* jvm_metrics) {
if (!_init_complete) {
return;
}
- static_cast<void>(JniUtil::GetJNIEnv(&env));
+
+ Status st = JniUtil::GetJNIEnv(&env);
+ if (!st.ok()) {
+ LOG(WARNING) << "JVM STATS GET JNI ENV FAIL";
+ return;
+ }
jobject memoryMXBeanObj =
env->CallStaticObjectMethod(_managementFactoryClass,
_getMemoryMXBeanMethod);
@@ -302,8 +368,8 @@ void jvmStats::refresh(JvmMetrics* jvm_metrics) {
jstring name =
(jstring)env->CallObjectMethod(memoryPoolMXBean,
_getMemoryPollMXBeanNameMethod);
- const char* nameStr = env->GetStringUTFChars(name, NULL);
- if (nameStr != NULL) {
+ const char* nameStr = env->GetStringUTFChars(name, nullptr);
+ if (nameStr != nullptr) {
auto it = _memoryPoolName.find(nameStr);
if (it == _memoryPoolName.end()) {
continue;
@@ -408,16 +474,22 @@ void jvmStats::refresh(JvmMetrics* jvm_metrics) {
env->DeleteLocalRef(threadMXBean);
env->DeleteLocalRef(gcMXBeansList);
}
-jvmStats::~jvmStats() {
+JvmStats::~JvmStats() {
if (!_init_complete) {
return;
}
- env->DeleteLocalRef(_newThreadStateObj);
- env->DeleteLocalRef(_runnableThreadStateObj);
- env->DeleteLocalRef(_blockedThreadStateObj);
- env->DeleteLocalRef(_waitingThreadStateObj);
- env->DeleteLocalRef(_timedWaitingThreadStateObj);
- env->DeleteLocalRef(_terminatedThreadStateObj);
+ try {
+ env->DeleteLocalRef(_newThreadStateObj);
+ env->DeleteLocalRef(_runnableThreadStateObj);
+ env->DeleteLocalRef(_blockedThreadStateObj);
+ env->DeleteLocalRef(_waitingThreadStateObj);
+ env->DeleteLocalRef(_timedWaitingThreadStateObj);
+ env->DeleteLocalRef(_terminatedThreadStateObj);
+
+ } catch (...) {
+ // When be is killed, DeleteLocalRef may fail.
+ // In order to exit more gracefully, we catch the exception here.
+ }
}
} // namespace doris
diff --git a/be/src/util/jvm_metrics.h b/be/src/util/jvm_metrics.h
index 5f9929d8cf0..459a3cbf938 100644
--- a/be/src/util/jvm_metrics.h
+++ b/be/src/util/jvm_metrics.h
@@ -17,8 +17,6 @@
#pragma once
-#include <jni.h>
-
#include "jni.h"
#include "util/jni-util.h"
#include "util/metrics.h"
@@ -27,7 +25,7 @@ namespace doris {
class JvmMetrics;
-class jvmStats {
+class JvmStats {
private:
JNIEnv* env = nullptr;
jclass _managementFactoryClass = nullptr;
@@ -98,16 +96,18 @@ private:
bool _init_complete = false;
public:
- jvmStats(JNIEnv* ENV);
- bool init_complete() { return _init_complete; }
+ // JvmStats(JNIEnv* ENV);
+ void init(JNIEnv* ENV);
+ bool init_complete() const { return _init_complete; }
+ void set_complete(bool val) { _init_complete = val; }
void refresh(JvmMetrics* jvm_metrics);
- ~jvmStats();
+ ~JvmStats();
};
class JvmMetrics {
public:
JvmMetrics(MetricRegistry* registry, JNIEnv* env);
- ~JvmMetrics() {}
+ ~JvmMetrics() = default;
void update();
IntGauge* jvm_heap_size_bytes_max = nullptr;
@@ -140,7 +140,7 @@ public:
IntGauge* jvm_gc_g1_old_generation_time_ms = nullptr;
private:
- jvmStats _jvm_stats;
+ JvmStats _jvm_stats;
std::shared_ptr<MetricEntity> _server_entity;
static const char* _s_hook_name;
MetricRegistry* _registry = nullptr;
diff --git a/regression-test/pipeline/external/conf/be.conf b/regression-test/pipeline/external/conf/be.conf
index 9a5b3641b84..2bd810e55cc 100644
--- a/regression-test/pipeline/external/conf/be.conf
+++ b/regression-test/pipeline/external/conf/be.conf
@@ -70,3 +70,6 @@ fragment_pool_thread_num_max=5000
enable_fuzzy_mode=true
enable_set_in_bitmap_value=true
enable_feature_binlog=true
+
+enable_jvm_monitor = true
+
diff --git a/regression-test/pipeline/p0/conf/be.conf b/regression-test/pipeline/p0/conf/be.conf
index 15f19ec4f42..b5d6944acae 100644
--- a/regression-test/pipeline/p0/conf/be.conf
+++ b/regression-test/pipeline/p0/conf/be.conf
@@ -82,3 +82,6 @@ user_files_secure_path=/
enable_debug_points=true
# debug scanner context dead loop
enable_debug_log_timeout_secs=0
+
+enable_jvm_monitor = true
+
diff --git a/regression-test/pipeline/p1/conf/be.conf b/regression-test/pipeline/p1/conf/be.conf
index e1ae9653c78..0c450c9281e 100644
--- a/regression-test/pipeline/p1/conf/be.conf
+++ b/regression-test/pipeline/p1/conf/be.conf
@@ -71,4 +71,7 @@ fragment_pool_thread_num_max=5000
enable_fuzzy_mode=true
enable_set_in_bitmap_value=true
enable_feature_binlog=true
-max_sys_mem_available_low_water_mark_bytes=69206016
\ No newline at end of file
+max_sys_mem_available_low_water_mark_bytes=69206016
+
+enable_jvm_monitor = true
+
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]