Fix yamllint issues

master
Jack Henschel 2 years ago
parent 804ddfa89d
commit 5e8a43056a

@ -0,0 +1,8 @@
---
# see man 1 yamllint for the defaults
extends: default
rules:
  line-length: disable
  comments:
    min-spaces-from-content: 1

@ -470,8 +470,8 @@ profile::mariadb::backup_dir: '/var/backups/mariadb/'
# Hetzner DNS servers
profile::dns::primary_dns_servers:
  - '213.133.100.100'
  # - '213.133.99.99'
  # - '2a01:4f8:0:1::add:9999'
  - '2a01:4f8:0:1::add:1010'
# TODO: use local DNS server cache for docker
# profile::docker::dns: ["127.0.0.53"]

@ -1,92 +1,93 @@
---
# This file is managed by Puppet
groups:
  - name: rules/cockroach-aggregation.rules
    rules:
      - record: node:capacity
        expr: sum without(store) (capacity{job="cockroachdb"})
      - record: cluster:capacity
        expr: sum without(instance) (node:capacity{job="cockroachdb"})
      - record: node:capacity_available
        expr: sum without(store) (capacity_available{job="cockroachdb"})
      - record: cluster:capacity_available
        expr: sum without(instance) (node:capacity_available{job="cockroachdb"})
      - record: capacity_available:ratio
        expr: capacity_available{job="cockroachdb"} / capacity{job="cockroachdb"}
      - record: node:capacity_available:ratio
        expr: node:capacity_available{job="cockroachdb"} / node:capacity{job="cockroachdb"}
      - record: cluster:capacity_available:ratio
        expr: cluster:capacity_available{job="cockroachdb"} / cluster:capacity{job="cockroachdb"}
      # Histogram rules: these are fairly expensive to compute live, so we precompute a few percentiles.
      - record: txn_durations_bucket:rate1m
        expr: rate(txn_durations_bucket{job="cockroachdb"}[1m])
      - record: txn_durations:rate1m:quantile_50
        expr: histogram_quantile(0.5, txn_durations_bucket:rate1m)
      - record: txn_durations:rate1m:quantile_75
        expr: histogram_quantile(0.75, txn_durations_bucket:rate1m)
      - record: txn_durations:rate1m:quantile_90
        expr: histogram_quantile(0.9, txn_durations_bucket:rate1m)
      - record: txn_durations:rate1m:quantile_95
        expr: histogram_quantile(0.95, txn_durations_bucket:rate1m)
      - record: txn_durations:rate1m:quantile_99
        expr: histogram_quantile(0.99, txn_durations_bucket:rate1m)
      - record: exec_latency_bucket:rate1m
        expr: rate(exec_latency_bucket{job="cockroachdb"}[1m])
      - record: exec_latency:rate1m:quantile_50
        expr: histogram_quantile(0.5, exec_latency_bucket:rate1m)
      - record: exec_latency:rate1m:quantile_75
        expr: histogram_quantile(0.75, exec_latency_bucket:rate1m)
      - record: exec_latency:rate1m:quantile_90
        expr: histogram_quantile(0.9, exec_latency_bucket:rate1m)
      - record: exec_latency:rate1m:quantile_95
        expr: histogram_quantile(0.95, exec_latency_bucket:rate1m)
      - record: exec_latency:rate1m:quantile_99
        expr: histogram_quantile(0.99, exec_latency_bucket:rate1m)
      - record: round_trip_latency_bucket:rate1m
        expr: rate(round_trip_latency_bucket{job="cockroachdb"}[1m])
      - record: round_trip_latency:rate1m:quantile_50
        expr: histogram_quantile(0.5, round_trip_latency_bucket:rate1m)
      - record: round_trip_latency:rate1m:quantile_75
        expr: histogram_quantile(0.75, round_trip_latency_bucket:rate1m)
      - record: round_trip_latency:rate1m:quantile_90
        expr: histogram_quantile(0.9, round_trip_latency_bucket:rate1m)
      - record: round_trip_latency:rate1m:quantile_95
        expr: histogram_quantile(0.95, round_trip_latency_bucket:rate1m)
      - record: round_trip_latency:rate1m:quantile_99
        expr: histogram_quantile(0.99, round_trip_latency_bucket:rate1m)
      - record: sql_exec_latency_bucket:rate1m
        expr: rate(sql_exec_latency_bucket{job="cockroachdb"}[1m])
      - record: sql_exec_latency:rate1m:quantile_50
        expr: histogram_quantile(0.5, sql_exec_latency_bucket:rate1m)
      - record: sql_exec_latency:rate1m:quantile_75
        expr: histogram_quantile(0.75, sql_exec_latency_bucket:rate1m)
      - record: sql_exec_latency:rate1m:quantile_90
        expr: histogram_quantile(0.9, sql_exec_latency_bucket:rate1m)
      - record: sql_exec_latency:rate1m:quantile_95
        expr: histogram_quantile(0.95, sql_exec_latency_bucket:rate1m)
      - record: sql_exec_latency:rate1m:quantile_99
        expr: histogram_quantile(0.99, sql_exec_latency_bucket:rate1m)
      - record: raft_process_logcommit_latency_bucket:rate1m
        expr: rate(raft_process_logcommit_latency_bucket{job="cockroachdb"}[1m])
      - record: raft_process_logcommit_latency:rate1m:quantile_50
        expr: histogram_quantile(0.5, raft_process_logcommit_latency_bucket:rate1m)
      - record: raft_process_logcommit_latency:rate1m:quantile_75
        expr: histogram_quantile(0.75, raft_process_logcommit_latency_bucket:rate1m)
      - record: raft_process_logcommit_latency:rate1m:quantile_90
        expr: histogram_quantile(0.9, raft_process_logcommit_latency_bucket:rate1m)
      - record: raft_process_logcommit_latency:rate1m:quantile_95
        expr: histogram_quantile(0.95, raft_process_logcommit_latency_bucket:rate1m)
      - record: raft_process_logcommit_latency:rate1m:quantile_99
        expr: histogram_quantile(0.99, raft_process_logcommit_latency_bucket:rate1m)
      - record: raft_process_commandcommit_latency_bucket:rate1m
        expr: rate(raft_process_commandcommit_latency_bucket{job="cockroachdb"}[1m])
      - record: raft_process_commandcommit_latency:rate1m:quantile_50
        expr: histogram_quantile(0.5, raft_process_commandcommit_latency_bucket:rate1m)
      - record: raft_process_commandcommit_latency:rate1m:quantile_75
        expr: histogram_quantile(0.75, raft_process_commandcommit_latency_bucket:rate1m)
      - record: raft_process_commandcommit_latency:rate1m:quantile_90
        expr: histogram_quantile(0.9, raft_process_commandcommit_latency_bucket:rate1m)
      - record: raft_process_commandcommit_latency:rate1m:quantile_95
        expr: histogram_quantile(0.95, raft_process_commandcommit_latency_bucket:rate1m)
      - record: raft_process_commandcommit_latency:rate1m:quantile_99
        expr: histogram_quantile(0.99, raft_process_commandcommit_latency_bucket:rate1m)

@ -1,6 +1,8 @@
---
# This file is managed by Puppet
global:
  scrape_interval: 30s
  evaluation_interval: 30s
  external_labels:
@ -12,7 +14,7 @@ rule_files:
- "cockroach-aggregation.rules.yml"
# TODO: alerting with prometheus
#alerting:
# alerting:
# alertmanagers:
# - static_configs:
# - targets:
@ -26,40 +28,40 @@ scrape_configs:
- job_name: 'dockerd-exporter'
dns_sd_configs:
- names:
- 'tasks.dockerd-exporter'
type: 'A'
port: 9323
- names:
- 'tasks.dockerd-exporter'
type: 'A'
port: 9323
- job_name: 'cadvisor'
dns_sd_configs:
- names:
- 'tasks.cadvisor'
type: 'A'
port: 8080
- names:
- 'tasks.cadvisor'
type: 'A'
port: 8080
- job_name: 'node'
dns_sd_configs:
- names:
- 'tasks.node-exporter'
type: 'A'
port: 9100
- names:
- 'tasks.node-exporter'
type: 'A'
port: 9100
# this is a pretty ugly hack, but works fine since we just have one node
# for proper solutions see:
# https://github.com/prometheus-operator/prometheus-operator/issues/135
relabel_configs:
- source_labels: [__address__]
regex: '.*'
target_label: instance
replacement: '02.ht.cubieserver.de'
- source_labels: [__address__]
regex: '.*'
target_label: instance
replacement: '02.ht.cubieserver.de'
- job_name: 'cockroachdb'
metrics_path: '/_status/vars'
dns_sd_configs:
- names:
- 'tasks.cockroach'
type: 'A'
port: 8080
- names:
- 'tasks.cockroach'
type: 'A'
port: 8080
relabel_configs:
- source_labels: [__address__] # can be anything
target_label: 'cluster'
@ -71,21 +73,21 @@ scrape_configs:
<% end -%>
    metrics_path: /minio/prometheus/metrics
    dns_sd_configs:
      - names:
          - 'tasks.minio'
        type: 'A'
        port: 9000
  - job_name: traefik
    dns_sd_configs:
      - names:
          - 'tasks.traefik'
        type: 'A'
        port: 8080
  - job_name: nextcloud
    dns_sd_configs:
      - names:
          - 'tasks.nextcloud-exporter'
        type: 'A'
        port: 9205

@ -1,45 +1,47 @@
---
# This file is managed by Puppet
groups:
  - name: /1/store/projects/vagrant/docker-swarm-vagrant/apps/swarmprom/prometheus/rules/swarm_node.rules.yml
    rules:
      - alert: node_cpu_usage
        expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[1m]) * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name)) > 50
        for: 1m
        labels:
          severity: warning
        annotations:
          description: Swarm node {{ $labels.node_name }} CPU usage is at {{ humanize $value}}%.
          summary: CPU alert for Swarm node '{{ $labels.node_name }}'
      - alert: node_memory_usage
        expr: sum(((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * ON(instance) GROUP_LEFT(node_name) node_meta * 100) BY (node_name) > 80
        for: 1m
        labels:
          severity: warning
        annotations:
          description: Swarm node {{ $labels.node_name }} memory usage is at {{ humanize $value}}%.
          summary: Memory alert for Swarm node '{{ $labels.node_name }}'
      - alert: node_disk_usage
        expr: ((node_filesystem_size_bytes{mountpoint="/rootfs"} - node_filesystem_free_bytes{mountpoint="/rootfs"}) * 100 / node_filesystem_size_bytes{mountpoint="/rootfs"}) * ON(instance) GROUP_LEFT(node_name) node_meta > 85
        for: 1m
        labels:
          severity: warning
        annotations:
          description: Swarm node {{ $labels.node_name }} disk usage is at {{ humanize $value}}%.
          summary: Disk alert for Swarm node '{{ $labels.node_name }}'
      - alert: node_disk_fill_rate_6h
        expr: predict_linear(node_filesystem_free_bytes{mountpoint="/rootfs"}[1h], 6 * 3600) * ON(instance) GROUP_LEFT(node_name) node_meta < 0
        for: 1h
        labels:
          severity: critical
        annotations:
          description: Swarm node {{ $labels.node_name }} disk is going to fill up in 6h.
          summary: Disk fill alert for Swarm node '{{ $labels.node_name }}'

@ -1,25 +1,27 @@
---
# This file is managed by Puppet
groups:
  - name: /1/store/projects/vagrant/docker-swarm-vagrant/apps/swarmprom/prometheus/rules/swarm_task.rules.yml
    rules:
      - alert: task_high_cpu_usage_50
        expr: sum(rate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_task_name=~".+"}[1m])) BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id) * 100 > 50
        for: 1m
        annotations:
          description: '{{ $labels.container_label_com_docker_swarm_task_name }} on ''{{ $labels.container_label_com_docker_swarm_node_id }}'' CPU usage is at {{ humanize $value}}%.'
          summary: CPU alert for Swarm task '{{ $labels.container_label_com_docker_swarm_task_name }}' on '{{ $labels.container_label_com_docker_swarm_node_id }}'
      - alert: task_high_memory_usage_1g
        expr: sum(container_memory_rss{container_label_com_docker_swarm_task_name=~".+"}) BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id) > 1e+09
        for: 1m
        annotations:
          description: '{{ $labels.container_label_com_docker_swarm_task_name }} on ''{{ $labels.container_label_com_docker_swarm_node_id }}'' memory usage is {{ humanize $value}}.'
          summary: Memory alert for Swarm task '{{ $labels.container_label_com_docker_swarm_task_name }}' on '{{ $labels.container_label_com_docker_swarm_node_id }}'

Loading…
Cancel
Save