# Original listing metadata: 91 lines, 4.2 KiB, YAML
groups:
# Baseline checks: scrape-target availability, node reboots, public wifi upstream.
- name: Basic
  rules:
  # Taken from
  # https://awesome-prometheus-alerts.grep.to/rules.html#rule-prometheus-self-monitoring-1-2
  # Fires once a target has reported `up == 0` for one minute.
  # In the descriptions below, the continuation lines carry one extra leading
  # space on purpose: that space is part of the message text.
  - alert: PrometheusTargetMissing
    expr: 'up == 0'
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: 'Prometheus target missing (instance {{ $labels.instance }})'
      description: |-
        A Prometheus target has disappeared. An exporter might be crashed.
         VALUE = {{ $value }}
         LABELS = {{ $labels }}

  # Fires immediately (`for: 0m`) when node_boot_time_seconds changed at least
  # once inside the 2h window.
  - alert: NodeRebooted
    expr: 'changes(node_boot_time_seconds[2h]) > 0'
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: 'A node rebooted in the last 2 hours (instance {{ $labels.instance }})'
      description: |-
        The uptime of a node changed in the last two hours. VALUE = {{ $value }}
         LABELS = {{ $labels }}

  # Fires immediately when the summed probe_success of job "e2e_adp_clients_v4"
  # is 0.
  # NOTE(review): if no probe_success series exist for that job, sum() presumably
  # returns nothing and this alert stays silent - confirm that is acceptable.
  - alert: PublicWifiUpstreamLost
    expr: 'sum(probe_success{job="e2e_adp_clients_v4"}) == 0'
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: 'The public wifi lost its ability to route into the internet'
      description: 'check the vpn connection'
# Host filesystem capacity alerts: free disk space and free inodes.
- name: ServerSpecific
  rules:
  # Source: https://awesome-prometheus-alerts.grep.to/rules#rule-host-and-hardware-1-7
  #
  # Mountpoints that should be ignored belong in the node_exporter parameters, e.g.
  # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
  # The same rule written with "node_filesystem_free_bytes" will fire when the
  # disk fills for non-root users.
  # NOTE(review): newer node_exporter releases may use a different name for that
  # flag - confirm against the deployed version.
  #
  # The expression is folded over two lines; it parses to a single line joined
  # by one space.
  - alert: HostOutOfDiskSpace
    expr: >-
      (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10
      and ON (instance, device, mountpoint) node_filesystem_readonly == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: 'Host out of disk space (instance {{ $labels.instance }})'
      # Continuation lines keep one leading space; it is part of the message.
      description: |-
        Disk is almost full (< 10% left)
         VALUE = {{ $value }}
         LABELS = {{ $labels }}

  # Source: https://awesome-prometheus-alerts.grep.to/rules#rule-host-and-hardware-1-9
  # Same shape as the disk-space rule, applied to the free-inode ratio of
  # filesystems that are not read-only.
  - alert: HostOutOfInodes
    expr: >-
      node_filesystem_files_free / node_filesystem_files * 100 < 10
      and ON (instance, device, mountpoint) node_filesystem_readonly == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: 'Host out of inodes (instance {{ $labels.instance }})'
      # Continuation lines keep one leading space; it is part of the message.
      description: |-
        Disk is almost running out of available inodes (< 10% left)
         VALUE = {{ $value }}
         LABELS = {{ $labels }}
# Interface state, interface error counters, and SNMP uptime alerts.
# None of the rules in this group sets `for:`. Every expression looks back over
# a 2h window, which is why the annotations say the alarm clears in 2 hours.
- name: Network
  rules:
  # Fires when ifLastChange changed at least once inside the 2h window.
  - alert: PortChangedState
    expr: 'changes(ifLastChange[2h]) != 0'
    labels:
      severity: warning
    annotations:
      summary: >-
        {{ $labels.ifName }} on {{ $labels.instance }} changed its state
        {{ $value }}x time(s) in the last 2 hours
      description: 'This alarm will clear in 2 hours'

  # Fires when either receive-error counter (ifInErrors or
  # node_network_receive_errs_total) increased inside the 2h window.
  # The summary names the port by the ifName label when it is set and by the
  # device label otherwise. The template actions are written back to back so the
  # rendered text carries no stray leading or doubled spaces.
  - alert: PortIfInErrors
    expr: >-
      increase(ifInErrors[2h]) > 0
      or increase(node_network_receive_errs_total[2h]) > 0
    labels:
      severity: critical
    annotations:
      summary: >-
        {{ if $labels.ifName }}{{ $labels.ifName }}{{ else }}{{ $labels.device }}{{ end }}
        on {{ $labels.instance }} has {{ $value }} ifInErrors in the last 2 hours
      description: >-
        For some reason the port is throwing ifInErrors.
        This alarm will clear automatically in 2 hours

  # Transmit-side counterpart of PortIfInErrors (ifOutErrors or
  # node_network_transmit_errs_total), with the same summary/description layout.
  - alert: PortIfOutErrors
    expr: >-
      increase(ifOutErrors[2h]) > 0
      or increase(node_network_transmit_errs_total[2h]) > 0
    labels:
      severity: critical
    annotations:
      summary: >-
        {{ if $labels.ifName }}{{ $labels.ifName }}{{ else }}{{ $labels.device }}{{ end }}
        on {{ $labels.instance }} has {{ $value }} ifOutErrors in the last 2 hours
      description: >-
        For some reason the port is throwing ifOutErrors.
        This alarm will clear automatically in 2 hours

  # Fires while sysUpTime / 100 is at most 60 * 60 * 2 (= 7200).
  # NOTE(review): presumably sysUpTime is reported in hundredths of a second, so
  # this reads as "uptime <= 2 hours" - confirm against the SNMP exporter output.
  - alert: SNMPNodeRebooted
    expr: '(sysUpTime / 100) <= (60 * 60 * 2)'
    labels:
      severity: critical
    annotations:
      summary: '{{ $labels.instance }} rebooted at least one time in the last two hours'
      description: 'This alarm will clear in 2 hours'