Fix yamllint issues
parent
804ddfa89d
commit
5e8a43056a
@ -0,0 +1,8 @@
|
||||
---
|
||||
# see man 1 yamllint for the defaults
|
||||
extends: default
|
||||
|
||||
rules:
|
||||
line-length: disable
|
||||
comments:
|
||||
min-spaces-from-content: 1
|
@ -1,92 +1,93 @@
|
||||
---
|
||||
# This file is managed by Puppet
|
||||
|
||||
groups:
|
||||
- name: rules/cockroach-aggregation.rules
|
||||
rules:
|
||||
- record: node:capacity
|
||||
expr: sum without(store) (capacity{job="cockroachdb"})
|
||||
- record: cluster:capacity
|
||||
expr: sum without(instance) (node:capacity{job="cockroachdb"})
|
||||
- record: node:capacity_available
|
||||
expr: sum without(store) (capacity_available{job="cockroachdb"})
|
||||
- record: cluster:capacity_available
|
||||
expr: sum without(instance) (node:capacity_available{job="cockroachdb"})
|
||||
- record: capacity_available:ratio
|
||||
expr: capacity_available{job="cockroachdb"} / capacity{job="cockroachdb"}
|
||||
- record: node:capacity_available:ratio
|
||||
expr: node:capacity_available{job="cockroachdb"} / node:capacity{job="cockroachdb"}
|
||||
- record: cluster:capacity_available:ratio
|
||||
expr: cluster:capacity_available{job="cockroachdb"} / cluster:capacity{job="cockroachdb"}
|
||||
# Histogram rules: these are fairly expensive to compute live, so we precompute a few percentiles.
|
||||
- record: txn_durations_bucket:rate1m
|
||||
expr: rate(txn_durations_bucket{job="cockroachdb"}[1m])
|
||||
- record: txn_durations:rate1m:quantile_50
|
||||
expr: histogram_quantile(0.5, txn_durations_bucket:rate1m)
|
||||
- record: txn_durations:rate1m:quantile_75
|
||||
expr: histogram_quantile(0.75, txn_durations_bucket:rate1m)
|
||||
- record: txn_durations:rate1m:quantile_90
|
||||
expr: histogram_quantile(0.9, txn_durations_bucket:rate1m)
|
||||
- record: txn_durations:rate1m:quantile_95
|
||||
expr: histogram_quantile(0.95, txn_durations_bucket:rate1m)
|
||||
- record: txn_durations:rate1m:quantile_99
|
||||
expr: histogram_quantile(0.99, txn_durations_bucket:rate1m)
|
||||
- record: exec_latency_bucket:rate1m
|
||||
expr: rate(exec_latency_bucket{job="cockroachdb"}[1m])
|
||||
- record: exec_latency:rate1m:quantile_50
|
||||
expr: histogram_quantile(0.5, exec_latency_bucket:rate1m)
|
||||
- record: exec_latency:rate1m:quantile_75
|
||||
expr: histogram_quantile(0.75, exec_latency_bucket:rate1m)
|
||||
- record: exec_latency:rate1m:quantile_90
|
||||
expr: histogram_quantile(0.9, exec_latency_bucket:rate1m)
|
||||
- record: exec_latency:rate1m:quantile_95
|
||||
expr: histogram_quantile(0.95, exec_latency_bucket:rate1m)
|
||||
- record: exec_latency:rate1m:quantile_99
|
||||
expr: histogram_quantile(0.99, exec_latency_bucket:rate1m)
|
||||
- record: round_trip_latency_bucket:rate1m
|
||||
expr: rate(round_trip_latency_bucket{job="cockroachdb"}[1m])
|
||||
- record: round_trip_latency:rate1m:quantile_50
|
||||
expr: histogram_quantile(0.5, round_trip_latency_bucket:rate1m)
|
||||
- record: round_trip_latency:rate1m:quantile_75
|
||||
expr: histogram_quantile(0.75, round_trip_latency_bucket:rate1m)
|
||||
- record: round_trip_latency:rate1m:quantile_90
|
||||
expr: histogram_quantile(0.9, round_trip_latency_bucket:rate1m)
|
||||
- record: round_trip_latency:rate1m:quantile_95
|
||||
expr: histogram_quantile(0.95, round_trip_latency_bucket:rate1m)
|
||||
- record: round_trip_latency:rate1m:quantile_99
|
||||
expr: histogram_quantile(0.99, round_trip_latency_bucket:rate1m)
|
||||
- record: sql_exec_latency_bucket:rate1m
|
||||
expr: rate(sql_exec_latency_bucket{job="cockroachdb"}[1m])
|
||||
- record: sql_exec_latency:rate1m:quantile_50
|
||||
expr: histogram_quantile(0.5, sql_exec_latency_bucket:rate1m)
|
||||
- record: sql_exec_latency:rate1m:quantile_75
|
||||
expr: histogram_quantile(0.75, sql_exec_latency_bucket:rate1m)
|
||||
- record: sql_exec_latency:rate1m:quantile_90
|
||||
expr: histogram_quantile(0.9, sql_exec_latency_bucket:rate1m)
|
||||
- record: sql_exec_latency:rate1m:quantile_95
|
||||
expr: histogram_quantile(0.95, sql_exec_latency_bucket:rate1m)
|
||||
- record: sql_exec_latency:rate1m:quantile_99
|
||||
expr: histogram_quantile(0.99, sql_exec_latency_bucket:rate1m)
|
||||
- record: raft_process_logcommit_latency_bucket:rate1m
|
||||
expr: rate(raft_process_logcommit_latency_bucket{job="cockroachdb"}[1m])
|
||||
- record: raft_process_logcommit_latency:rate1m:quantile_50
|
||||
expr: histogram_quantile(0.5, raft_process_logcommit_latency_bucket:rate1m)
|
||||
- record: raft_process_logcommit_latency:rate1m:quantile_75
|
||||
expr: histogram_quantile(0.75, raft_process_logcommit_latency_bucket:rate1m)
|
||||
- record: raft_process_logcommit_latency:rate1m:quantile_90
|
||||
expr: histogram_quantile(0.9, raft_process_logcommit_latency_bucket:rate1m)
|
||||
- record: raft_process_logcommit_latency:rate1m:quantile_95
|
||||
expr: histogram_quantile(0.95, raft_process_logcommit_latency_bucket:rate1m)
|
||||
- record: raft_process_logcommit_latency:rate1m:quantile_99
|
||||
expr: histogram_quantile(0.99, raft_process_logcommit_latency_bucket:rate1m)
|
||||
- record: raft_process_commandcommit_latency_bucket:rate1m
|
||||
expr: rate(raft_process_commandcommit_latency_bucket{job="cockroachdb"}[1m])
|
||||
- record: raft_process_commandcommit_latency:rate1m:quantile_50
|
||||
expr: histogram_quantile(0.5, raft_process_commandcommit_latency_bucket:rate1m)
|
||||
- record: raft_process_commandcommit_latency:rate1m:quantile_75
|
||||
expr: histogram_quantile(0.75, raft_process_commandcommit_latency_bucket:rate1m)
|
||||
- record: raft_process_commandcommit_latency:rate1m:quantile_90
|
||||
expr: histogram_quantile(0.9, raft_process_commandcommit_latency_bucket:rate1m)
|
||||
- record: raft_process_commandcommit_latency:rate1m:quantile_95
|
||||
expr: histogram_quantile(0.95, raft_process_commandcommit_latency_bucket:rate1m)
|
||||
- record: raft_process_commandcommit_latency:rate1m:quantile_99
|
||||
expr: histogram_quantile(0.99, raft_process_commandcommit_latency_bucket:rate1m)
|
||||
- name: rules/cockroach-aggregation.rules
|
||||
rules:
|
||||
- record: node:capacity
|
||||
expr: sum without(store) (capacity{job="cockroachdb"})
|
||||
- record: cluster:capacity
|
||||
expr: sum without(instance) (node:capacity{job="cockroachdb"})
|
||||
- record: node:capacity_available
|
||||
expr: sum without(store) (capacity_available{job="cockroachdb"})
|
||||
- record: cluster:capacity_available
|
||||
expr: sum without(instance) (node:capacity_available{job="cockroachdb"})
|
||||
- record: capacity_available:ratio
|
||||
expr: capacity_available{job="cockroachdb"} / capacity{job="cockroachdb"}
|
||||
- record: node:capacity_available:ratio
|
||||
expr: node:capacity_available{job="cockroachdb"} / node:capacity{job="cockroachdb"}
|
||||
- record: cluster:capacity_available:ratio
|
||||
expr: cluster:capacity_available{job="cockroachdb"} / cluster:capacity{job="cockroachdb"}
|
||||
# Histogram rules: these are fairly expensive to compute live, so we precompute a few percentiles.
|
||||
- record: txn_durations_bucket:rate1m
|
||||
expr: rate(txn_durations_bucket{job="cockroachdb"}[1m])
|
||||
- record: txn_durations:rate1m:quantile_50
|
||||
expr: histogram_quantile(0.5, txn_durations_bucket:rate1m)
|
||||
- record: txn_durations:rate1m:quantile_75
|
||||
expr: histogram_quantile(0.75, txn_durations_bucket:rate1m)
|
||||
- record: txn_durations:rate1m:quantile_90
|
||||
expr: histogram_quantile(0.9, txn_durations_bucket:rate1m)
|
||||
- record: txn_durations:rate1m:quantile_95
|
||||
expr: histogram_quantile(0.95, txn_durations_bucket:rate1m)
|
||||
- record: txn_durations:rate1m:quantile_99
|
||||
expr: histogram_quantile(0.99, txn_durations_bucket:rate1m)
|
||||
- record: exec_latency_bucket:rate1m
|
||||
expr: rate(exec_latency_bucket{job="cockroachdb"}[1m])
|
||||
- record: exec_latency:rate1m:quantile_50
|
||||
expr: histogram_quantile(0.5, exec_latency_bucket:rate1m)
|
||||
- record: exec_latency:rate1m:quantile_75
|
||||
expr: histogram_quantile(0.75, exec_latency_bucket:rate1m)
|
||||
- record: exec_latency:rate1m:quantile_90
|
||||
expr: histogram_quantile(0.9, exec_latency_bucket:rate1m)
|
||||
- record: exec_latency:rate1m:quantile_95
|
||||
expr: histogram_quantile(0.95, exec_latency_bucket:rate1m)
|
||||
- record: exec_latency:rate1m:quantile_99
|
||||
expr: histogram_quantile(0.99, exec_latency_bucket:rate1m)
|
||||
- record: round_trip_latency_bucket:rate1m
|
||||
expr: rate(round_trip_latency_bucket{job="cockroachdb"}[1m])
|
||||
- record: round_trip_latency:rate1m:quantile_50
|
||||
expr: histogram_quantile(0.5, round_trip_latency_bucket:rate1m)
|
||||
- record: round_trip_latency:rate1m:quantile_75
|
||||
expr: histogram_quantile(0.75, round_trip_latency_bucket:rate1m)
|
||||
- record: round_trip_latency:rate1m:quantile_90
|
||||
expr: histogram_quantile(0.9, round_trip_latency_bucket:rate1m)
|
||||
- record: round_trip_latency:rate1m:quantile_95
|
||||
expr: histogram_quantile(0.95, round_trip_latency_bucket:rate1m)
|
||||
- record: round_trip_latency:rate1m:quantile_99
|
||||
expr: histogram_quantile(0.99, round_trip_latency_bucket:rate1m)
|
||||
- record: sql_exec_latency_bucket:rate1m
|
||||
expr: rate(sql_exec_latency_bucket{job="cockroachdb"}[1m])
|
||||
- record: sql_exec_latency:rate1m:quantile_50
|
||||
expr: histogram_quantile(0.5, sql_exec_latency_bucket:rate1m)
|
||||
- record: sql_exec_latency:rate1m:quantile_75
|
||||
expr: histogram_quantile(0.75, sql_exec_latency_bucket:rate1m)
|
||||
- record: sql_exec_latency:rate1m:quantile_90
|
||||
expr: histogram_quantile(0.9, sql_exec_latency_bucket:rate1m)
|
||||
- record: sql_exec_latency:rate1m:quantile_95
|
||||
expr: histogram_quantile(0.95, sql_exec_latency_bucket:rate1m)
|
||||
- record: sql_exec_latency:rate1m:quantile_99
|
||||
expr: histogram_quantile(0.99, sql_exec_latency_bucket:rate1m)
|
||||
- record: raft_process_logcommit_latency_bucket:rate1m
|
||||
expr: rate(raft_process_logcommit_latency_bucket{job="cockroachdb"}[1m])
|
||||
- record: raft_process_logcommit_latency:rate1m:quantile_50
|
||||
expr: histogram_quantile(0.5, raft_process_logcommit_latency_bucket:rate1m)
|
||||
- record: raft_process_logcommit_latency:rate1m:quantile_75
|
||||
expr: histogram_quantile(0.75, raft_process_logcommit_latency_bucket:rate1m)
|
||||
- record: raft_process_logcommit_latency:rate1m:quantile_90
|
||||
expr: histogram_quantile(0.9, raft_process_logcommit_latency_bucket:rate1m)
|
||||
- record: raft_process_logcommit_latency:rate1m:quantile_95
|
||||
expr: histogram_quantile(0.95, raft_process_logcommit_latency_bucket:rate1m)
|
||||
- record: raft_process_logcommit_latency:rate1m:quantile_99
|
||||
expr: histogram_quantile(0.99, raft_process_logcommit_latency_bucket:rate1m)
|
||||
- record: raft_process_commandcommit_latency_bucket:rate1m
|
||||
expr: rate(raft_process_commandcommit_latency_bucket{job="cockroachdb"}[1m])
|
||||
- record: raft_process_commandcommit_latency:rate1m:quantile_50
|
||||
expr: histogram_quantile(0.5, raft_process_commandcommit_latency_bucket:rate1m)
|
||||
- record: raft_process_commandcommit_latency:rate1m:quantile_75
|
||||
expr: histogram_quantile(0.75, raft_process_commandcommit_latency_bucket:rate1m)
|
||||
- record: raft_process_commandcommit_latency:rate1m:quantile_90
|
||||
expr: histogram_quantile(0.9, raft_process_commandcommit_latency_bucket:rate1m)
|
||||
- record: raft_process_commandcommit_latency:rate1m:quantile_95
|
||||
expr: histogram_quantile(0.95, raft_process_commandcommit_latency_bucket:rate1m)
|
||||
- record: raft_process_commandcommit_latency:rate1m:quantile_99
|
||||
expr: histogram_quantile(0.99, raft_process_commandcommit_latency_bucket:rate1m)
|
||||
|
@ -1,45 +1,47 @@
|
||||
---
|
||||
# This file is managed by Puppet
|
||||
|
||||
groups:
|
||||
- name: /1/store/projects/vagrant/docker-swarm-vagrant/apps/swarmprom/prometheus/rules/swarm_node.rules.yml
|
||||
rules:
|
||||
- alert: node_cpu_usage
|
||||
expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[1m]) * ON(instance) GROUP_LEFT(node_name)
|
||||
node_meta * 100) BY (node_name)) > 50
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: Swarm node {{ $labels.node_name }} CPU usage is at {{ humanize
|
||||
$value}}%.
|
||||
summary: CPU alert for Swarm node '{{ $labels.node_name }}'
|
||||
- alert: node_memory_usage
|
||||
expr: sum(((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes)
|
||||
* ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 80
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: Swarm node {{ $labels.node_name }} memory usage is at {{ humanize
|
||||
$value}}%.
|
||||
summary: Memory alert for Swarm node '{{ $labels.node_name }}'
|
||||
- alert: node_disk_usage
|
||||
expr: ((node_filesystem_size_bytes{mountpoint="/rootfs"} - node_filesystem_free_bytes{mountpoint="/rootfs"})
|
||||
* 100 / node_filesystem_size_bytes{mountpoint="/rootfs"}) * ON(instance) GROUP_LEFT(node_name)
|
||||
node_meta > 85
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: Swarm node {{ $labels.node_name }} disk usage is at {{ humanize
|
||||
$value}}%.
|
||||
summary: Disk alert for Swarm node '{{ $labels.node_name }}'
|
||||
- alert: node_disk_fill_rate_6h
|
||||
expr: predict_linear(node_filesystem_free_bytes{mountpoint="/rootfs"}[1h], 6 * 3600) * ON(instance)
|
||||
GROUP_LEFT(node_name) node_meta < 0
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: Swarm node {{ $labels.node_name }} disk is going to fill up in
|
||||
6h.
|
||||
summary: Disk fill alert for Swarm node '{{ $labels.node_name }}'
|
||||
- name: /1/store/projects/vagrant/docker-swarm-vagrant/apps/swarmprom/prometheus/rules/swarm_node.rules.yml
|
||||
rules:
|
||||
- alert: node_cpu_usage
|
||||
expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[1m]) * ON(instance) GROUP_LEFT(node_name)
|
||||
node_meta * 100) BY (node_name)) > 50
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: Swarm node {{ $labels.node_name }} CPU usage is at {{ humanize
|
||||
$value}}%.
|
||||
summary: CPU alert for Swarm node '{{ $labels.node_name }}'
|
||||
- alert: node_memory_usage
|
||||
expr: sum(((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes)
|
||||
* ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 80
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: Swarm node {{ $labels.node_name }} memory usage is at {{ humanize
|
||||
$value}}%.
|
||||
summary: Memory alert for Swarm node '{{ $labels.node_name }}'
|
||||
- alert: node_disk_usage
|
||||
expr: ((node_filesystem_size_bytes{mountpoint="/rootfs"} - node_filesystem_free_bytes{mountpoint="/rootfs"})
|
||||
* 100 / node_filesystem_size_bytes{mountpoint="/rootfs"}) * ON(instance) GROUP_LEFT(node_name)
|
||||
node_meta > 85
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: Swarm node {{ $labels.node_name }} disk usage is at {{ humanize
|
||||
$value}}%.
|
||||
summary: Disk alert for Swarm node '{{ $labels.node_name }}'
|
||||
- alert: node_disk_fill_rate_6h
|
||||
expr: predict_linear(node_filesystem_free_bytes{mountpoint="/rootfs"}[1h], 6 * 3600) * ON(instance)
|
||||
GROUP_LEFT(node_name) node_meta < 0
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: Swarm node {{ $labels.node_name }} disk is going to fill up in
|
||||
6h.
|
||||
summary: Disk fill alert for Swarm node '{{ $labels.node_name }}'
|
||||
|
@ -1,25 +1,27 @@
|
||||
---
|
||||
# This file is managed by Puppet
|
||||
|
||||
groups:
|
||||
- name: /1/store/projects/vagrant/docker-swarm-vagrant/apps/swarmprom/prometheus/rules/swarm_task.rules.yml
|
||||
rules:
|
||||
- alert: task_high_cpu_usage_50
|
||||
expr: sum(rate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_task_name=~".+"}[1m]))
|
||||
BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id)
|
||||
* 100 > 50
|
||||
for: 1m
|
||||
annotations:
|
||||
description: '{{ $labels.container_label_com_docker_swarm_task_name }} on ''{{
|
||||
$labels.container_label_com_docker_swarm_node_id }}'' CPU usage is at {{ humanize
|
||||
$value}}%.'
|
||||
summary: CPU alert for Swarm task '{{ $labels.container_label_com_docker_swarm_task_name
|
||||
}}' on '{{ $labels.container_label_com_docker_swarm_node_id }}'
|
||||
- alert: task_high_memory_usage_1g
|
||||
expr: sum(container_memory_rss{container_label_com_docker_swarm_task_name=~".+"})
|
||||
BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id) > 1e+09
|
||||
for: 1m
|
||||
annotations:
|
||||
description: '{{ $labels.container_label_com_docker_swarm_task_name }} on ''{{
|
||||
$labels.container_label_com_docker_swarm_node_id }}'' memory usage is {{ humanize
|
||||
$value}}.'
|
||||
summary: Memory alert for Swarm task '{{ $labels.container_label_com_docker_swarm_task_name
|
||||
}}' on '{{ $labels.container_label_com_docker_swarm_node_id }}'
|
||||
- name: /1/store/projects/vagrant/docker-swarm-vagrant/apps/swarmprom/prometheus/rules/swarm_task.rules.yml
|
||||
rules:
|
||||
- alert: task_high_cpu_usage_50
|
||||
expr: sum(rate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_task_name=~".+"}[1m]))
|
||||
BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id)
|
||||
* 100 > 50
|
||||
for: 1m
|
||||
annotations:
|
||||
description: '{{ $labels.container_label_com_docker_swarm_task_name }} on ''{{
|
||||
$labels.container_label_com_docker_swarm_node_id }}'' CPU usage is at {{ humanize
|
||||
$value}}%.'
|
||||
summary: CPU alert for Swarm task '{{ $labels.container_label_com_docker_swarm_task_name
|
||||
}}' on '{{ $labels.container_label_com_docker_swarm_node_id }}'
|
||||
- alert: task_high_memory_usage_1g
|
||||
expr: sum(container_memory_rss{container_label_com_docker_swarm_task_name=~".+"})
|
||||
BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id) > 1e+09
|
||||
for: 1m
|
||||
annotations:
|
||||
description: '{{ $labels.container_label_com_docker_swarm_task_name }} on ''{{
|
||||
$labels.container_label_com_docker_swarm_node_id }}'' memory usage is {{ humanize
|
||||
$value}}.'
|
||||
summary: Memory alert for Swarm task '{{ $labels.container_label_com_docker_swarm_task_name
|
||||
}}' on '{{ $labels.container_label_com_docker_swarm_node_id }}'
|
||||
|
Loading…
Reference in New Issue