Copilot commented on code in PR #13903:
URL: https://github.com/apache/skywalking/pull/13903#discussion_r3389708730
##########
oap-server/server-starter/src/main/resources/otel-rules/banyandb/banyandb-instance.yaml:
##########
@@ -13,74 +13,157 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-# This will parse a textual representation of a duration. The formats
-# accepted are based on the ISO-8601 duration format {@code PnDTnHnMn.nS}
-# with days considered to be exactly 24 hours.
-# <p>
-# Examples:
-# <pre>
-# "PT20.345S" -- parses as "20.345 seconds"
-# "PT15M" -- parses as "15 minutes" (where a minute is 60 seconds)
-# "PT10H" -- parses as "10 hours" (where an hour is 3600 seconds)
-# "P2D" -- parses as "2 days" (where a day is 24 hours or 86400
seconds)
-# "P2DT3H4M" -- parses as "2 days, 3 hours and 4 minutes"
-# "P-6H3M" -- parses as "-6 hours and +3 minutes"
-# "-P6H3M" -- parses as "-6 hours and -3 minutes"
-# "-P-6H+3M" -- parses as "+6 hours and -3 minutes"
-# </pre>
+# SWIP-15: BanyanDB self-observability, ServiceInstance scope = one container
on a node.
+# The instance identity is pod_name + container_name (a data hot/warm pod
co-hosts a data and a
+# lifecycle container under one pod_name), joined by '@'. role
(container_name) and tier (node_type)
+# ride as instance attributes via the 6-arg instance() properties closure;
node_type Elvis-defaults
+# to 'n/a' off data containers (it is absent on liaison samples, present on
every ROLE_DATA sample).
+#
+# Every rule that aggregates keeps
['cluster','pod_name','container_name','node_role','node_type'] in
+# its .sum()/.avg()/.max() group-by: SampleFamily.aggregate() drops labels not
in the group-by, and
+# the properties closure reads them from the post-aggregation sample
(SampleFamily.java:810). node_type
+# rides on every ROLE_DATA sample (system_*, go_*, process_* included), so a
data instance resolves a
+# stable tier across all rules; liaison families carry none, so liaison
resolves 'n/a' consistently.
+#
+# Source expressions mirror the upstream BanyanDB Grafana "Nodes" board
+# (docs/operation/grafana-fodc-nodes.json) plus the liaison/data rows of the
"Workload" board, so the
+# SkyWalking instance dashboard stays in lockstep with the upstream catalog.
filter: "{ tags -> tags.job_name == 'banyandb-monitoring' }"
-expSuffix: tag({tags -> tags.host_name = 'banyandb::' +
tags.host_name}).service(['host_name'] ,
Layer.BANYANDB).instance(['host_name'], ['service_instance_id'], Layer.BANYANDB)
-metricPrefix: meter_banyandb
+expSuffix: |-
+ service(['cluster'], Layer.BANYANDB)
+ .instance(['cluster'], '::', ['pod_name', 'container_name'], '@',
Layer.BANYANDB, { tags -> ['node_role': tags.node_role, 'node_type':
tags.node_type ?: 'n/a', 'pod_name': tags.pod_name, 'container_name':
tags.container_name] })
+metricPrefix: meter_banyandb_instance
metricsRules:
- - name: instance_write_rate
- exp:
banyandb_measure_total_written.rate('PT15S')+banyandb_stream_tst_total_written.rate('PT15S')
- - name: instance_total_memory
- exp: banyandb_system_memory_state.tagEqual('kind','total')
- - name: instance_disk_usage
- exp:
banyandb_system_disk.tagEqual('kind','used').sum(['host_name','service_instance_id'])
- - name: instance_query_rate
- exp:
banyandb_liaison_grpc_total_started.sum(['method','host_name','service_instance_id'])
- - name: instance_total_cpu
- exp: banyandb_system_cpu_num
- - name: instance_write_and_query_errors_rate
- exp:
banyandb_liaison_grpc_total_err.tagEqual('method','query').sum(['method','host_name','service_instance_id']).rate('PT15S')*60
+
banyandb_liaison_grpc_total_stream_msg_sent_err.sum(['host_name','service_instance_id']).rate('PT15S')*60
+
banyandb_liaison_grpc_total_stream_msg_received_err.sum(['host_name','service_instance_id']).rate('PT15S')*60
+
banyandb_queue_sub_total_msg_sent_err.sum(['host_name','service_instance_id']).rate('PT15S')*60
- - name: instance_etcd_operation_rate
- exp:
banyandb_liaison_grpc_total_registry_started.sum(['host_name','service_instance_id']).rate('PT15S')
+
banyandb_liaison_grpc_total_started.sum(['host_name','service_instance_id']).rate('PT15S')
- - name: instance_active_instance
- exp: up.sum(['host_name','service_instance_id']).downsampling(MIN)
- - name: instance_cpu_usage
- exp:
(((process_cpu_seconds_total.sum(['host_name','service_instance_id']).rate('PT15S')
/
banyandb_system_cpu_num.sum(['host_name','service_instance_id']))).max(['host_name','service_instance_id']))*1000
- - name: instance_rss_memory_usage
- exp:
((process_resident_memory_bytes.sum(['host_name','service_instance_id']).downsampling(MAX)
/
banyandb_system_memory_state.tagEqual('kind','total').sum(['host_name','service_instance_id'])).max(['host_name','service_instance_id']))*1000
- - name: instance_disk_usage_all
- exp:
((banyandb_system_disk.tagEqual('kind','used').sum(['host_name','service_instance_id'])
/
banyandb_system_memory_state.tagEqual('kind','total').sum(['host_name','service_instance_id'])).max(['host_name','service_instance_id']))*1000
- - name: instance_network_usage_recv
- exp:
banyandb_system_net_state.tagEqual('kind','bytes_recv').sum(['host_name','service_instance_id']).rate('PT15S')
- - name: instance_network_usage_sent
- exp:
banyandb_system_net_state.tagEqual('kind','bytes_sent').sum(['host_name','service_instance_id']).rate('PT15S')
- - name: instance_storage_write_rate
- exp:
banyandb_measure_total_written.sum(['group','host_name','service_instance_id']).rate('PT15S')*1000
- - name: instance_query_latency
- exp:
(banyandb_liaison_grpc_total_latency.tagEqual('method','query').sum(['group','host_name','service_instance_id']).rate('PT15S')
/
banyandb_liaison_grpc_total_started.tagEqual('method','query').sum(['group','host_name','service_instance_id']).rate('PT15S'))*1000
- - name: instance_total_data
- exp:
banyandb_measure_total_file_elements.sum(['group','host_name','service_instance_id'])
- - name: instance_merge_file_data
- exp:
banyandb_measure_total_merge_loop_started.sum(['group','host_name','service_instance_id']).rate('PT15S')
* 60 *1000
- - name: instance_merge_file_latency
- exp:
(banyandb_measure_total_merge_latency.tagEqual('type','file').sum(['group','host_name','service_instance_id']).rate('PT15S')
/
banyandb_measure_total_merge_loop_started.sum(['group','host_name','service_instance_id']).rate('PT15S'))*1000
- - name: instance_merge_file_partitions
- exp:
(banyandb_measure_total_merged_parts.tagEqual('type','file').sum(['group','host_name','service_instance_id']).rate('PT15S')
/
banyandb_measure_total_merge_loop_started.sum(['group','host_name','service_instance_id']).rate('PT15S'))*1000
- - name: instance_series_write_rate
- exp:
(banyandb_measure_inverted_index_total_updates.sum(['group','host_name','service_instance_id']).rate('PT15S'))*1000
- - name: instance_series_term_search_rate
- exp:
banyandb_stream_storage_inverted_index_total_term_searchers_started.sum(['group','host_name','service_instance_id']).rate('PT15S')
- - name: instance_total_series
- exp:
banyandb_measure_inverted_index_total_doc_count.sum(['group','host_name','service_instance_id'])
- - name: instance_stream_write_rate
- exp:
banyandb_stream_tst_inverted_index_total_updates.sum(['group','host_name','service_instance_id']).rate('PT15S')
- - name: instance_term_search_rate
- exp:
banyandb_stream_tst_inverted_index_total_term_searchers_started.sum(['group','host_name','service_instance_id']).rate('PT15S')*
1000
- - name: instance_total_document
- exp:
banyandb_stream_tst_inverted_index_total_doc_count.sum(['group','host_name','service_instance_id'])
+ # ---- All roles: Resources / Disk by Path / Go Runtime (every container
emits these) ----
+ # node uptime (s). Raw gauge; ABSENT on lifecycle containers (their binary
runs the metric service
+ # without the system collector), so the lifecycle instance shows no uptime.
+ - name: node_uptime
+ exp: banyandb_system_up_time
+ # CPU usage (cores). process_* rides on every container including lifecycle.
+ - name: cpu_usage
+ exp:
process_cpu_seconds_total.sum(['cluster','pod_name','container_name','node_role','node_type']).rate('PT15S')
+ # resident memory (bytes). Raw gauge, present on all containers.
+ - name: rss_memory
+ exp: process_resident_memory_bytes
+ # system memory used %. kind='used_percent' is emitted directly (a 0-1
fraction; source divides by 100).
+ - name: system_memory_percent
+ exp: banyandb_system_memory_state.tagEqual('kind','used_percent')
+ # disk used % = Σused / Σtotal across the node's data paths (matches the
Grafana "Disk Usage %" panel).
+ - name: disk_usage_percent
+ exp:
banyandb_system_disk.tagEqual('kind','used').sum(['cluster','pod_name','container_name','node_role','node_type'])
/
banyandb_system_disk.tagEqual('kind','total').sum(['cluster','pod_name','container_name','node_role','node_type'])
+ # disk used / total / used% broken out per mount path.
+ - name: disk_used_by_path
+ exp:
banyandb_system_disk.tagEqual('kind','used').sum(['cluster','pod_name','container_name','node_role','node_type','path'])
+ - name: disk_total_by_path
+ exp:
banyandb_system_disk.tagEqual('kind','total').sum(['cluster','pod_name','container_name','node_role','node_type','path'])
+ - name: disk_used_percent_by_path
+ exp:
banyandb_system_disk.tagEqual('kind','used').sum(['cluster','pod_name','container_name','node_role','node_type','path'])
/
banyandb_system_disk.tagEqual('kind','total').sum(['cluster','pod_name','container_name','node_role','node_type','path'])
+ # network throughput (bytes/s) by interface name.
+ - name: network_recv
+ exp:
banyandb_system_net_state.tagEqual('kind','bytes_recv').sum(['cluster','pod_name','container_name','node_role','node_type','name']).rate('PT15S')
+ - name: network_sent
+ exp:
banyandb_system_net_state.tagEqual('kind','bytes_sent').sum(['cluster','pod_name','container_name','node_role','node_type','name']).rate('PT15S')
+ # Go runtime.
+ - name: goroutines
+ exp: go_goroutines
+ # average GC pause (s) = rate(Σpause) / rate(Σcount). go_gc_duration_seconds
is a summary (no buckets),
+ # so this ratio of _sum/_count is the only valid average — do not apply
histogram_percentile to it.
+ - name: gc_pause_avg
+ exp:
go_gc_duration_seconds_sum.sum(['cluster','pod_name','container_name','node_role','node_type']).rate('PT15S')
/
go_gc_duration_seconds_count.sum(['cluster','pod_name','container_name','node_role','node_type']).rate('PT15S')
Review Comment:
`gc_pause_avg` divides `rate(_sum)` by `rate(_count)`. When no GC events occur
in the window, `_count.rate(...)` is 0, so the MAL division can yield
`Infinity`/`NaN` (typically 0/0, since `_sum` does not advance either).
Consider using `safeDiv` for this ratio to avoid emitting invalid samples.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]