groups:
# Baseline availability alerts: scrape targets, node reboots and the public
# wifi uplink.
- name: Basic
  rules:
  # from https://awesome-prometheus-alerts.grep.to/rules.html#rule-prometheus-self-monitoring-1-2
  # Fires once a scrape target has reported up == 0 for a full minute.
  - alert: PrometheusTargetMissing
    expr: up == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus target missing (instance {{ $labels.instance }})"
      description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  # Fires immediately (for: 0m) when node_boot_time_seconds changed value at
  # least once inside the 2h window; the alert resolves by itself once the
  # change falls out of that window.
  - alert: NodeRebooted
    expr: changes(node_boot_time_seconds[2h]) > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "A node rebooted in the last 2 hours (instance {{ $labels.instance }})"
      description: "The uptime of a node changed in the last two hours. VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  # Fires when the probe_success samples of the job add up to 0, i.e. no
  # probe in the job currently succeeds.
  # NOTE(review): sum() over an empty selection returns no series, so this
  # rule stays silent if the job exports no probe_success samples at all
  # (for example when the prober itself is down) - confirm that case is
  # covered elsewhere, e.g. by PrometheusTargetMissing.
  - alert: PublicWifiUpstreamLost
    expr: sum(probe_success{job="e2e_adp_clients_v4"}) == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "The public wifi lost its ability to route into the internet"
      description: "check the vpn connection"
# Host-level capacity alerts built on node_exporter filesystem metrics.
- name: ServerSpecific
  rules:
  # https://awesome-prometheus-alerts.grep.to/rules#rule-host-and-hardware-1-7
  #
  # Please add ignored mountpoints in node_exporter parameters like
  # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
  # NOTE(review): newer node_exporter releases appear to name this flag
  # "--collector.filesystem.mount-points-exclude" - check the deployed version.
  # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
  #
  # Fires when less than 10% of a writable filesystem has been available for
  # 2 minutes; the "and ON (...)" clause drops read-only filesystems.
  - alert: HostOutOfDiskSpace
    expr: >-
      (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10
      and ON (instance, device, mountpoint) node_filesystem_readonly == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "Host out of disk space (instance {{ $labels.instance }})"
      description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  # https://awesome-prometheus-alerts.grep.to/rules#rule-host-and-hardware-1-9
  #
  # Fires when less than 10% of the inodes of a writable filesystem have been
  # free for 2 minutes; read-only filesystems are excluded the same way as in
  # HostOutOfDiskSpace.
  - alert: HostOutOfInodes
    expr: >-
      node_filesystem_files_free / node_filesystem_files * 100 < 10
      and ON (instance, device, mountpoint) node_filesystem_readonly == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "Host out of inodes (instance {{ $labels.instance }})"
      description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Network alerts. Every rule looks back over a 2h window and has no hold
# time (for: 0m), so each alert fires at once and resolves by itself when the
# triggering event leaves the window.
- name: Network
  rules:
  # Fires when ifLastChange took a new value inside the last 2 hours.
  # $value is the number of changes seen in the window.
  - alert: PortChangedState
    expr: changes(ifLastChange[2h]) != 0
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "{{ $labels.ifName }} on {{ $labels.instance }} changed its state {{ $value }}x time(s) in the last 2 hours"
      description: "This alarm will clear in 2 hours"

  # Fires when receive errors increased during the last 2 hours, using either
  # the ifInErrors counter or node_network_receive_errs_total.
  # The summary prints the ifName label when it is set and falls back to the
  # device label otherwise; the template branches contain no padding so the
  # rendered text carries no stray spaces.
  - alert: PortIfInErrors
    expr: >-
      increase(ifInErrors[2h]) > 0
      or increase(node_network_receive_errs_total[2h]) > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "{{ if $labels.ifName }}{{ $labels.ifName }}{{ else }}{{ $labels.device }}{{ end }} on {{ $labels.instance }} has {{ $value }} ifInErrors in the last 2 hours"
      description: "For some reason the port is throwing ifInErrors. This alarm will clear automatically in 2 hours"

  # Transmit-side counterpart of PortIfInErrors: ifOutErrors or
  # node_network_transmit_errs_total increased during the last 2 hours.
  - alert: PortIfOutErrors
    expr: >-
      increase(ifOutErrors[2h]) > 0
      or increase(node_network_transmit_errs_total[2h]) > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "{{ if $labels.ifName }}{{ $labels.ifName }}{{ else }}{{ $labels.device }}{{ end }} on {{ $labels.instance }} has {{ $value }} ifOutErrors in the last 2 hours"
      description: "For some reason the port is throwing ifOutErrors. This alarm will clear automatically in 2 hours"

  # Fires while the reported uptime is at most 2 hours (60 * 60 * 2 seconds).
  # sysUpTime is divided by 100 because SNMP TimeTicks count hundredths of a
  # second.
  - alert: SNMPNodeRebooted
    expr: (sysUpTime / 100) <= (60 * 60 * 2)
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "{{ $labels.instance }} rebooted at least one time in the last two hours"
      description: "This alarm will clear in 2 hours"