This is an automated email from the ASF dual-hosted git repository.
wu-sheng pushed a commit to branch swip-15-banyandb-so11y-rules
in repository https://gitbox.apache.org/repos/asf/skywalking.git
The following commit(s) were added to refs/heads/swip-15-banyandb-so11y-rules
by this push:
new 12ada47c03 SWIP-15: use safeDiv for all ratio metrics (avoid NaN/Infinity on idle windows)
12ada47c03 is described below
commit 12ada47c03b92967333cfb8a77494717d862c4f8
Author: Wu Sheng <[email protected]>
AuthorDate: Thu Jun 11 00:06:30 2026 +0800
SWIP-15: use safeDiv for all ratio metrics (avoid NaN/Infinity on idle windows)
Address review feedback: query_latency, merge_file_latency/partitions, gc_pause_avg,
and disk-usage-percent divided counters/rates with the `/` operator, which yields
NaN/Infinity when the denominator rate is 0 (no queries / merges / GC events in a
window). Switch every division to SampleFamily.safeDiv (returns 0.0 for an empty/zero
denominator), matching the shipped envoy-ai-gateway latency rules. Boot-check
(DSLClassGeneratorTest) recompiles all rules clean.
Co-Authored-By: Claude Fable 5 <[email protected]>
---
.../main/resources/otel-rules/banyandb/banyandb-endpoint.yaml | 6 +++---
.../main/resources/otel-rules/banyandb/banyandb-instance.yaml | 10 +++++-----
2 files changed, 8 insertions(+), 8 deletions(-)
diff --git
a/oap-server/server-starter/src/main/resources/otel-rules/banyandb/banyandb-endpoint.yaml
b/oap-server/server-starter/src/main/resources/otel-rules/banyandb/banyandb-endpoint.yaml
index ef61c460c8..c208d1da66 100644
---
a/oap-server/server-starter/src/main/resources/otel-rules/banyandb/banyandb-endpoint.yaml
+++
b/oap-server/server-starter/src/main/resources/otel-rules/banyandb/banyandb-endpoint.yaml
@@ -39,7 +39,7 @@ metricsRules:
# not a percentile. Both filtered to method='query' and reduced to
['cluster','group']
# (collapsing the `service` data-model facet) before the division joins on
equal labels.
- name: query_latency
- exp: (banyandb_liaison_grpc_total_latency.tagEqual('method',
'query').sum(['cluster', 'group']) /
banyandb_liaison_grpc_total_started.tagEqual('method', 'query').sum(['cluster',
'group'])) * 1000
+ exp: banyandb_liaison_grpc_total_latency.tagEqual('method',
'query').sum(['cluster',
'group']).safeDiv(banyandb_liaison_grpc_total_started.tagEqual('method',
'query').sum(['cluster', 'group'])) * 1000
# current total stored data elements for the group (gauge). Dimensioned by
seg+shard+node_type
# across data nodes; .sum(['cluster','group']) collapses them into one
per-group total.
@@ -57,13 +57,13 @@ metricsRules:
# (liaison emits only type='mem'). Divide accumulated merge-seconds by merge
loops, both
# type/scope-aligned to ['cluster','group']. Matches the upstream "Merge
File Latency" panel.
- name: merge_file_latency
- exp: ((banyandb_measure_total_merge_latency.tagEqual('type',
'file').sum(['cluster', 'group']).rate('PT1M') /
banyandb_measure_total_merge_loop_started.sum(['cluster',
'group']).rate('PT1M')) +
(banyandb_stream_tst_total_merge_latency.tagEqual('type',
'file').sum(['cluster', 'group']).rate('PT1M') /
banyandb_stream_tst_total_merge_loop_started.sum(['cluster',
'group']).rate('PT1M')) +
(banyandb_trace_tst_total_merge_latency.tagEqual('type',
'file').sum(['cluster', 'group']).rate('PT1 [...]
+ exp: (banyandb_measure_total_merge_latency.tagEqual('type',
'file').sum(['cluster',
'group']).rate('PT1M').safeDiv(banyandb_measure_total_merge_loop_started.sum(['cluster',
'group']).rate('PT1M')) +
banyandb_stream_tst_total_merge_latency.tagEqual('type',
'file').sum(['cluster',
'group']).rate('PT1M').safeDiv(banyandb_stream_tst_total_merge_loop_started.sum(['cluster',
'group']).rate('PT1M')) +
banyandb_trace_tst_total_merge_latency.tagEqual('type', 'file').sum(['cluster',
'group']). [...]
# avg parts merged per merge loop on the on-disk merge path for the group
(matches the upstream
# "Merge File Partitions" panel = rate(merged_parts{type=file}) /
rate(merge_loop_started)).
# merged_parts carries `type`; type='file' is DATA-only (liaison emits only
type='mem').
- name: merge_file_partitions
- exp: ((banyandb_measure_total_merged_parts.tagEqual('type',
'file').sum(['cluster', 'group']).rate('PT1M') /
banyandb_measure_total_merge_loop_started.sum(['cluster',
'group']).rate('PT1M')) +
(banyandb_stream_tst_total_merged_parts.tagEqual('type',
'file').sum(['cluster', 'group']).rate('PT1M') /
banyandb_stream_tst_total_merge_loop_started.sum(['cluster',
'group']).rate('PT1M')) +
(banyandb_trace_tst_total_merged_parts.tagEqual('type', 'file').sum(['cluster',
'group']).rate('PT1M') [...]
+ exp: (banyandb_measure_total_merged_parts.tagEqual('type',
'file').sum(['cluster',
'group']).rate('PT1M').safeDiv(banyandb_measure_total_merge_loop_started.sum(['cluster',
'group']).rate('PT1M')) +
banyandb_stream_tst_total_merged_parts.tagEqual('type', 'file').sum(['cluster',
'group']).rate('PT1M').safeDiv(banyandb_stream_tst_total_merge_loop_started.sum(['cluster',
'group']).rate('PT1M')) +
banyandb_trace_tst_total_merged_parts.tagEqual('type', 'file').sum(['cluster',
'group']).rat [...]
# inverted-index updates/s for the group. NOTE:
*_inverted_index_total_updates is # TYPE=gauge
# though cumulative; rate() over a cumulative gauge yields a per-window
delta (updates/s). Stream
diff --git
a/oap-server/server-starter/src/main/resources/otel-rules/banyandb/banyandb-instance.yaml
b/oap-server/server-starter/src/main/resources/otel-rules/banyandb/banyandb-instance.yaml
index c3f728b139..6604455b81 100644
---
a/oap-server/server-starter/src/main/resources/otel-rules/banyandb/banyandb-instance.yaml
+++
b/oap-server/server-starter/src/main/resources/otel-rules/banyandb/banyandb-instance.yaml
@@ -50,14 +50,14 @@ metricsRules:
exp: banyandb_system_memory_state.tagEqual('kind','used_percent')
# disk used % = Σused / Σtotal across the node's data paths (matches the
Grafana "Disk Usage %" panel).
- name: disk_usage_percent
- exp:
banyandb_system_disk.tagEqual('kind','used').sum(['cluster','pod_name','container_name','node_role','node_type'])
/
banyandb_system_disk.tagEqual('kind','total').sum(['cluster','pod_name','container_name','node_role','node_type'])
+ exp:
banyandb_system_disk.tagEqual('kind','used').sum(['cluster','pod_name','container_name','node_role','node_type']).safeDiv(banyandb_system_disk.tagEqual('kind','total').sum(['cluster','pod_name','container_name','node_role','node_type']))
# disk used / total / used% broken out per mount path.
- name: disk_used_by_path
exp:
banyandb_system_disk.tagEqual('kind','used').sum(['cluster','pod_name','container_name','node_role','node_type','path'])
- name: disk_total_by_path
exp:
banyandb_system_disk.tagEqual('kind','total').sum(['cluster','pod_name','container_name','node_role','node_type','path'])
- name: disk_used_percent_by_path
- exp:
banyandb_system_disk.tagEqual('kind','used').sum(['cluster','pod_name','container_name','node_role','node_type','path'])
/
banyandb_system_disk.tagEqual('kind','total').sum(['cluster','pod_name','container_name','node_role','node_type','path'])
+ exp:
banyandb_system_disk.tagEqual('kind','used').sum(['cluster','pod_name','container_name','node_role','node_type','path']).safeDiv(banyandb_system_disk.tagEqual('kind','total').sum(['cluster','pod_name','container_name','node_role','node_type','path']))
# network throughput (bytes/s) by interface name.
- name: network_recv
exp:
banyandb_system_net_state.tagEqual('kind','bytes_recv').sum(['cluster','pod_name','container_name','node_role','node_type','name']).rate('PT15S')
@@ -69,7 +69,7 @@ metricsRules:
# average GC pause (s) = rate(Σpause) / rate(Σcount). go_gc_duration_seconds
is a summary (no buckets),
# so this ratio of _sum/_count is the only valid average — do not apply
histogram_percentile to it.
- name: gc_pause_avg
- exp:
go_gc_duration_seconds_sum.sum(['cluster','pod_name','container_name','node_role','node_type']).rate('PT15S')
/
go_gc_duration_seconds_count.sum(['cluster','pod_name','container_name','node_role','node_type']).rate('PT15S')
+ exp:
go_gc_duration_seconds_sum.sum(['cluster','pod_name','container_name','node_role','node_type']).rate('PT15S').safeDiv(go_gc_duration_seconds_count.sum(['cluster','pod_name','container_name','node_role','node_type']).rate('PT15S'))
- name: heap_inuse
exp: go_memstats_heap_inuse_bytes
- name: heap_next_gc
@@ -120,10 +120,10 @@ metricsRules:
# avg parts merged per merge loop on the file path (matches Grafana =
rate(merged_parts{type=file}) /
# rate(merge_loop_started)). type='file' is data-only on the wire (liaison
emits only type='mem').
- name: merge_file_partitions
- exp:
(banyandb_measure_total_merged_parts.tagEqual('type','file').sum(['cluster','pod_name','container_name','node_role','node_type']).rate('PT15S')
/
banyandb_measure_total_merge_loop_started.sum(['cluster','pod_name','container_name','node_role','node_type']).rate('PT15S'))
+
(banyandb_stream_tst_total_merged_parts.tagEqual('type','file').sum(['cluster','pod_name','container_name','node_role','node_type']).rate('PT15S')
/ banyandb_stream_tst_total_merge_loop_started.sum(['cluster', [...]
+ exp:
banyandb_measure_total_merged_parts.tagEqual('type','file').sum(['cluster','pod_name','container_name','node_role','node_type']).rate('PT15S').safeDiv(banyandb_measure_total_merge_loop_started.sum(['cluster','pod_name','container_name','node_role','node_type']).rate('PT15S'))
+
banyandb_stream_tst_total_merged_parts.tagEqual('type','file').sum(['cluster','pod_name','container_name','node_role','node_type']).rate('PT15S').safeDiv(banyandb_stream_tst_total_merge_loop_started.sum([
[...]
# avg file-merge latency (ms) per merge loop.
- name: merge_file_latency
- exp:
((banyandb_measure_total_merge_latency.tagEqual('type','file').sum(['cluster','pod_name','container_name','node_role','node_type']).rate('PT15S')
/
banyandb_measure_total_merge_loop_started.sum(['cluster','pod_name','container_name','node_role','node_type']).rate('PT15S'))
+
(banyandb_stream_tst_total_merge_latency.tagEqual('type','file').sum(['cluster','pod_name','container_name','node_role','node_type']).rate('PT15S')
/ banyandb_stream_tst_total_merge_loop_started.sum(['cluste [...]
+ exp:
(banyandb_measure_total_merge_latency.tagEqual('type','file').sum(['cluster','pod_name','container_name','node_role','node_type']).rate('PT15S').safeDiv(banyandb_measure_total_merge_loop_started.sum(['cluster','pod_name','container_name','node_role','node_type']).rate('PT15S'))
+
banyandb_stream_tst_total_merge_latency.tagEqual('type','file').sum(['cluster','pod_name','container_name','node_role','node_type']).rate('PT15S').safeDiv(banyandb_stream_tst_total_merge_loop_started.su
[...]
# inverted-index (series) write rate / term-search rate / total docs.
*_inverted_index_total_* are
# # TYPE=gauge but cumulative, so rate() yields a per-window delta. Stream's
series index is the
# storage scope (stream_storage_*); the tst scope is reported separately
below.