groups:
  - name: Basic
    rules:
        # from https://awesome-prometheus-alerts.grep.to/rules.html#rule-prometheus-self-monitoring-1-2
        # Critical alert for any series whose `up` value is 0; the condition
        # must hold for 1m before the alert fires.
        - alert: PrometheusTargetMissing
          expr: 'up == 0'
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: 'Prometheus target missing (instance {{ $labels.instance }})'
            # Literal block scalar: the two extra leading spaces on the VALUE /
            # LABELS lines are part of the rendered text.
            description: |-
              A Prometheus target has disappeared. An exporter might be crashed.
                VALUE = {{ $value }}
                LABELS = {{ $labels }}

        # Critical alert, without delay (for: 0m), whenever
        # node_boot_time_seconds changed at least once within the last 2h.
        - alert: NodeRebooted
          expr: 'changes(node_boot_time_seconds[2h]) > 0'
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: 'A node rebooted in the last 2 hours (instance {{ $labels.instance }})'
            # Kept double-quoted: the text relies on the "\n" escape followed by
            # a single leading space before LABELS.
            description: "The uptime of a node changed in the last two hours. VALUE = {{ $value }}\n LABELS = {{ $labels }}"

        # Critical alert, without delay, when the sum of probe_success over the
        # job "e2e_adp_clients_v4" equals 0, i.e. no probe of that job succeeds.
        # NOTE(review): if the job exposes no probe_success series at all,
        # sum() yields no sample and this rule stays silent - confirm that this
        # case is covered by another alert.
        - alert: PublicWifiUpstreamLost
          expr: 'sum(probe_success{job="e2e_adp_clients_v4"}) == 0'
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: 'The public wifi lost its ability to route into the internet'
            description: 'check the vpn connection'

  - name: ServerSpecific
    rules:
        # https://awesome-prometheus-alerts.grep.to/rules#rule-host-and-hardware-1-7
        #
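        # Exclude pseudo/virtual mountpoints via node_exporter parameters, e.g.
        # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)"
        # (newer node_exporter releases name this flag "--collector.filesystem.mount-points-exclude").
        # This rule uses "node_filesystem_avail_bytes" (space available to non-root users);
        # the same rule using "node_filesystem_free_bytes" would also count root-reserved blocks.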
        # Warning when available bytes are below 10 % of the filesystem size
        # for 2m. The `and ON (instance, device, mountpoint)` clause keeps only
        # filesystems whose node_filesystem_readonly value is 0.
        - alert: HostOutOfDiskSpace
          # Folded scalar: both lines are joined with a single space into one
          # PromQL expression.
          expr: >-
            (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10
            and ON (instance, device, mountpoint) node_filesystem_readonly == 0
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: 'Host out of disk space (instance {{ $labels.instance }})'
            # Literal block scalar: the two extra leading spaces on the VALUE /
            # LABELS lines are part of the rendered text.
            description: |-
              Disk is almost full (< 10% left)
                VALUE = {{ $value }}
                LABELS = {{ $labels }}

        # https://awesome-prometheus-alerts.grep.to/rules#rule-host-and-hardware-1-9
        # Warning when free inodes are below 10 % of all inodes for 2m. The
        # `and ON (instance, device, mountpoint)` clause keeps only filesystems
        # whose node_filesystem_readonly value is 0.
        - alert: HostOutOfInodes
          # Folded scalar: both lines are joined with a single space into one
          # PromQL expression.
          expr: >-
            node_filesystem_files_free / node_filesystem_files * 100 < 10
            and ON (instance, device, mountpoint) node_filesystem_readonly == 0
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: 'Host out of inodes (instance {{ $labels.instance }})'
            # Literal block scalar: the two extra leading spaces on the VALUE /
            # LABELS lines are part of the rendered text.
            description: |-
              Disk is almost running out of available inodes (< 10% left)
                VALUE = {{ $value }}
                LABELS = {{ $labels }}

  - name: Network
    rules:
      # Warning whenever ifLastChange changed at least once within the last 2h.
      # `for: 0m` is the Prometheus default; it is spelled out here to match
      # the other zero-delay rules in this file.
      - alert: PortChangedState
        expr: changes(ifLastChange[2h]) != 0
        for: 0m
        labels:
          severity: warning
        annotations:
          # $value is the number of changes counted by changes() in the window.
          summary: "A switch port changed its state {{ $value }}x time"
          description: "For some reason a switch port changed its state\n LABELS = {{ $labels }}"

      # Critical alert as soon as ifInErrors is above 0 (no `for:` delay set).
      # NOTE(review): ifInErrors is presumably a cumulative SNMP counter, so the
      # alert stays active until the port counters are cleared, as the
      # description says - confirm this latching behaviour is intended.
      - alert: PortIfInErrors
        expr: 'ifInErrors > 0'
        labels:
          severity: critical
        annotations:
          # Must stay quoted: the value starts with "{{".
          summary: '{{ $labels.ifName }} on {{ $labels.instance }} has {{ $value }} ifInErrors'
          description: 'For some reason the port is throwing ifInErrors. Clear port counters to clear the alarm'

      # Critical alert as soon as ifOutErrors is above 0 (no `for:` delay set).
      # NOTE(review): ifOutErrors is presumably a cumulative SNMP counter, so
      # the alert stays active until the port counters are cleared, as the
      # description says - confirm this latching behaviour is intended.
      - alert: PortIfOutErrors
        expr: 'ifOutErrors > 0'
        labels:
          severity: critical
        annotations:
          # Must stay quoted: the value starts with "{{".
          summary: '{{ $labels.ifName }} on {{ $labels.instance }} has {{ $value }} ifOutErrors'
          description: 'For some reason the port is throwing ifOutErrors. Clear port counters to clear the alarm'

      # Critical alert, without delay, while sysUpTime / 100 is at most
      # 60 * 60 * 2 (= 7200, two hours in seconds). The division implies that
      # sysUpTime is reported in hundredths of a second (SNMP TimeTicks).
      # NOTE(review): a wrap-around of the sysUpTime counter would presumably
      # look like a reboot to this rule - confirm whether that matters here.
      - alert: SNMPNodeRebooted
        expr: '(sysUpTime / 100) <= (60 * 60 * 2)'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: 'A snmp node rebooted in the last 2 hours (instance {{ $labels.instance }})'
          # Kept double-quoted: the text relies on the "\n" escape followed by
          # a single leading space before LABELS.
          description: "The uptime of a snmp node changed in the last two hours. VALUE = {{ $value }}\n LABELS = {{ $labels }}"