This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 5368bb19b07 [feature](be jvm monitor)append enable_jvm_monitor in
be.conf to control jvm monitor. (#35608)
5368bb19b07 is described below
commit 5368bb19b072791bbc6f33c0f814a281ce0f8f83
Author: daidai <[email protected]>
AuthorDate: Sat Jun 1 10:14:51 2024 +0800
[feature](be jvm monitor)append enable_jvm_monitor in be.conf to control
jvm monitor. (#35608)
## Proposed changes
Previous PR: #35023
To prevent doris_be from crashing while collecting JVM information because
of JVM incompatibility issues, you can set
`enable_jvm_monitor = true / false` in `be.conf` to enable or disable the
JVM metrics.
The default value of `enable_jvm_monitor` is false.
When JVM monitoring hits 30 consecutive exceptions, JVM information
collection is turned off and all JVM metric values are set to 0.
Issue Number: close #xxx
<!--Describe your changes.-->
---------
Co-authored-by: morningman <[email protected]>
---
be/src/common/config.cpp | 4 +
be/src/common/config.h | 4 +
be/src/util/jvm_metrics.cpp | 112 ++++++++++++++++++++-----
be/src/util/jvm_metrics.h | 16 ++--
regression-test/pipeline/external/conf/be.conf | 3 +
regression-test/pipeline/p0/conf/be.conf | 3 +
regression-test/pipeline/p1/conf/be.conf | 3 +
7 files changed, 117 insertions(+), 28 deletions(-)
diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp
index 3e8d72dba55..ee21430a970 100644
--- a/be/src/common/config.cpp
+++ b/be/src/common/config.cpp
@@ -1283,8 +1283,12 @@ DEFINE_Int64(max_nonblock_close_thread_num, "64");
DEFINE_mDouble(mem_alloc_fault_probability, "0.0");
// The time out milliseconds for remote fetch schema RPC, default 60s
DEFINE_mInt64(fetch_remote_schema_rpc_timeout_ms, "60000");
+
DEFINE_Int64(s3_file_system_local_upload_buffer_size, "5242880");
+//JVM monitoring enable. To prevent be from crashing due to jvm compatibility
issues. The default setting is off.
+DEFINE_Bool(enable_jvm_monitor, "false");
+
// clang-format off
#ifdef BE_TEST
// test s3
diff --git a/be/src/common/config.h b/be/src/common/config.h
index 20d85077e3f..8df54c25318 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -1361,8 +1361,12 @@ DECLARE_mDouble(mem_alloc_fault_probability);
// The time out milliseconds for remote fetch schema RPC
DECLARE_mInt64(fetch_remote_schema_rpc_timeout_ms);
// The size of the local buffer for S3FileSytem's upload function
+
DECLARE_Int64(s3_file_system_local_upload_buffer_size);
+//JVM monitoring enable. To prevent be from crashing due to jvm compatibility
issues.
+DECLARE_Bool(enable_jvm_monitor);
+
#ifdef BE_TEST
// test s3
DECLARE_String(test_s3_resource);
diff --git a/be/src/util/jvm_metrics.cpp b/be/src/util/jvm_metrics.cpp
index e55cf8f3fbe..fc30d1073ac 100644
--- a/be/src/util/jvm_metrics.cpp
+++ b/be/src/util/jvm_metrics.cpp
@@ -17,10 +17,12 @@
#include "jvm_metrics.h"
+#include <util/jni-util.h>
+
#include <functional>
+#include "common/config.h"
#include "util/metrics.h"
-
namespace doris {
#define DEFINE_JVM_SIZE_BYTES_METRIC(name, type)
\
@@ -76,15 +78,28 @@
DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(jvm_gc_g1_old_generation_time_ms, MetricUni
const char* JvmMetrics::_s_hook_name = "jvm_metrics";
-JvmMetrics::JvmMetrics(MetricRegistry* registry, JNIEnv* env) :
_jvm_stats(env) {
+JvmMetrics::JvmMetrics(MetricRegistry* registry, JNIEnv* env) {
DCHECK(registry != nullptr);
_registry = registry;
_server_entity = _registry->register_entity("server");
DCHECK(_server_entity != nullptr);
- if (_jvm_stats.init_complete()) {
+
+ do {
+ if (!doris::config::enable_jvm_monitor) {
+ break;
+ }
+ try {
+ _jvm_stats.init(env);
+ } catch (...) {
+ LOG(WARNING) << "JVM STATS INIT FAIL";
+ break;
+ }
+ if (!_jvm_stats.init_complete()) {
+ break;
+ }
_server_entity->register_hook(_s_hook_name,
std::bind(&JvmMetrics::update, this));
- }
+ } while (false);
INT_GAUGE_METRIC_REGISTER(_server_entity, jvm_heap_size_bytes_max);
INT_GAUGE_METRIC_REGISTER(_server_entity, jvm_heap_size_bytes_committed);
@@ -117,11 +132,58 @@ JvmMetrics::JvmMetrics(MetricRegistry* registry, JNIEnv*
env) : _jvm_stats(env)
}
void JvmMetrics::update() {
- _jvm_stats.refresh(this);
+ static long fail_count = 0;
+ bool have_exception = false;
+ try {
+ _jvm_stats.refresh(this);
+ } catch (...) {
+ have_exception = true;
+ LOG(WARNING) << "JVM MONITOR UPDATE FAIL!";
+ fail_count++;
+ }
+
+ //When 30 consecutive exceptions occur, turn off jvm information
collection.
+ if (!have_exception) {
+ fail_count = 0;
+ }
+ if (fail_count >= 30) {
+ LOG(WARNING) << "JVM MONITOR CLOSE!";
+ _jvm_stats.set_complete(false);
+ _server_entity->deregister_hook(_s_hook_name);
+
+ jvm_heap_size_bytes_max->set_value(0);
+ jvm_heap_size_bytes_committed->set_value(0);
+ jvm_heap_size_bytes_used->set_value(0);
+
+ jvm_non_heap_size_bytes_used->set_value(0);
+ jvm_non_heap_size_bytes_committed->set_value(0);
+
+ jvm_young_size_bytes_used->set_value(0);
+ jvm_young_size_bytes_peak_used->set_value(0);
+ jvm_young_size_bytes_max->set_value(0);
+
+ jvm_old_size_bytes_used->set_value(0);
+ jvm_old_size_bytes_peak_used->set_value(0);
+ jvm_old_size_bytes_max->set_value(0);
+
+ jvm_thread_count->set_value(0);
+ jvm_thread_peak_count->set_value(0);
+ jvm_thread_new_count->set_value(0);
+ jvm_thread_runnable_count->set_value(0);
+ jvm_thread_blocked_count->set_value(0);
+ jvm_thread_waiting_count->set_value(0);
+ jvm_thread_timed_waiting_count->set_value(0);
+ jvm_thread_terminated_count->set_value(0);
+
+ jvm_gc_g1_young_generation_count->set_value(0);
+ jvm_gc_g1_young_generation_time_ms->set_value(0);
+ jvm_gc_g1_old_generation_count->set_value(0);
+ jvm_gc_g1_old_generation_time_ms->set_value(0);
+ }
}
-#include <util/jni-util.h>
-jvmStats::jvmStats(JNIEnv* ENV) : env(ENV) {
+void JvmStats::init(JNIEnv* ENV) {
+ env = ENV;
_managementFactoryClass =
env->FindClass("java/lang/management/ManagementFactory");
if (_managementFactoryClass == nullptr) {
LOG(WARNING)
@@ -244,15 +306,19 @@ jvmStats::jvmStats(JNIEnv* ENV) : env(ENV) {
LOG(INFO) << "Start JVM monitoring.";
_init_complete = true;
+ return;
}
-#include "jni.h"
-
-void jvmStats::refresh(JvmMetrics* jvm_metrics) {
+void JvmStats::refresh(JvmMetrics* jvm_metrics) {
if (!_init_complete) {
return;
}
- static_cast<void>(JniUtil::GetJNIEnv(&env));
+
+ Status st = JniUtil::GetJNIEnv(&env);
+ if (!st.ok()) {
+ LOG(WARNING) << "JVM STATS GET JNI ENV FAIL";
+ return;
+ }
jobject memoryMXBeanObj =
env->CallStaticObjectMethod(_managementFactoryClass,
_getMemoryMXBeanMethod);
@@ -302,8 +368,8 @@ void jvmStats::refresh(JvmMetrics* jvm_metrics) {
jstring name =
(jstring)env->CallObjectMethod(memoryPoolMXBean,
_getMemoryPollMXBeanNameMethod);
- const char* nameStr = env->GetStringUTFChars(name, NULL);
- if (nameStr != NULL) {
+ const char* nameStr = env->GetStringUTFChars(name, nullptr);
+ if (nameStr != nullptr) {
auto it = _memoryPoolName.find(nameStr);
if (it == _memoryPoolName.end()) {
continue;
@@ -408,16 +474,22 @@ void jvmStats::refresh(JvmMetrics* jvm_metrics) {
env->DeleteLocalRef(threadMXBean);
env->DeleteLocalRef(gcMXBeansList);
}
-jvmStats::~jvmStats() {
+JvmStats::~JvmStats() {
if (!_init_complete) {
return;
}
- env->DeleteLocalRef(_newThreadStateObj);
- env->DeleteLocalRef(_runnableThreadStateObj);
- env->DeleteLocalRef(_blockedThreadStateObj);
- env->DeleteLocalRef(_waitingThreadStateObj);
- env->DeleteLocalRef(_timedWaitingThreadStateObj);
- env->DeleteLocalRef(_terminatedThreadStateObj);
+ try {
+ env->DeleteLocalRef(_newThreadStateObj);
+ env->DeleteLocalRef(_runnableThreadStateObj);
+ env->DeleteLocalRef(_blockedThreadStateObj);
+ env->DeleteLocalRef(_waitingThreadStateObj);
+ env->DeleteLocalRef(_timedWaitingThreadStateObj);
+ env->DeleteLocalRef(_terminatedThreadStateObj);
+
+ } catch (...) {
+ // When be is killed, DeleteLocalRef may fail.
+ // In order to exit more gracefully, we catch the exception here.
+ }
}
} // namespace doris
diff --git a/be/src/util/jvm_metrics.h b/be/src/util/jvm_metrics.h
index 5f9929d8cf0..459a3cbf938 100644
--- a/be/src/util/jvm_metrics.h
+++ b/be/src/util/jvm_metrics.h
@@ -17,8 +17,6 @@
#pragma once
-#include <jni.h>
-
#include "jni.h"
#include "util/jni-util.h"
#include "util/metrics.h"
@@ -27,7 +25,7 @@ namespace doris {
class JvmMetrics;
-class jvmStats {
+class JvmStats {
private:
JNIEnv* env = nullptr;
jclass _managementFactoryClass = nullptr;
@@ -98,16 +96,18 @@ private:
bool _init_complete = false;
public:
- jvmStats(JNIEnv* ENV);
- bool init_complete() { return _init_complete; }
+ // JvmStats(JNIEnv* ENV);
+ void init(JNIEnv* ENV);
+ bool init_complete() const { return _init_complete; }
+ void set_complete(bool val) { _init_complete = val; }
void refresh(JvmMetrics* jvm_metrics);
- ~jvmStats();
+ ~JvmStats();
};
class JvmMetrics {
public:
JvmMetrics(MetricRegistry* registry, JNIEnv* env);
- ~JvmMetrics() {}
+ ~JvmMetrics() = default;
void update();
IntGauge* jvm_heap_size_bytes_max = nullptr;
@@ -140,7 +140,7 @@ public:
IntGauge* jvm_gc_g1_old_generation_time_ms = nullptr;
private:
- jvmStats _jvm_stats;
+ JvmStats _jvm_stats;
std::shared_ptr<MetricEntity> _server_entity;
static const char* _s_hook_name;
MetricRegistry* _registry = nullptr;
diff --git a/regression-test/pipeline/external/conf/be.conf
b/regression-test/pipeline/external/conf/be.conf
index 85fedf8873f..6fb930cc5ea 100644
--- a/regression-test/pipeline/external/conf/be.conf
+++ b/regression-test/pipeline/external/conf/be.conf
@@ -72,3 +72,6 @@ enable_set_in_bitmap_value=true
enable_feature_binlog=true
trino_connector_plugin_dir=/tmp/trino_connector/connectors
+
+enable_jvm_monitor = true
+
diff --git a/regression-test/pipeline/p0/conf/be.conf
b/regression-test/pipeline/p0/conf/be.conf
index 474c30a05de..67605a5bdd9 100644
--- a/regression-test/pipeline/p0/conf/be.conf
+++ b/regression-test/pipeline/p0/conf/be.conf
@@ -60,3 +60,6 @@ enable_debug_points=true
enable_debug_log_timeout_secs=0
trino_connector_plugin_dir=/tmp/trino_connector/connectors
+
+enable_jvm_monitor = true
+
diff --git a/regression-test/pipeline/p1/conf/be.conf
b/regression-test/pipeline/p1/conf/be.conf
index fde67fbbaf7..d278b30fb67 100644
--- a/regression-test/pipeline/p1/conf/be.conf
+++ b/regression-test/pipeline/p1/conf/be.conf
@@ -58,3 +58,6 @@ user_files_secure_path=/
enable_debug_points=true
# debug scanner context dead loop
enable_debug_log_timeout_secs=0
+
+enable_jvm_monitor = true
+
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]