Compare commits

...

8 Commits

Author SHA1 Message Date
Gregor Michels 9cfee1f384 monitoring: add alerting rules for disks running out of space 2022-11-19 01:58:14 +01:00
Gregor Michels dca1261f07 inventory: fix site for ffl-ans-gw-core01
Fixes: 4afda5bdd9
2022-11-19 01:48:11 +01:00
Gregor Michels ffb7617db8 monitoring: add 'location' info 2022-11-19 01:46:52 +01:00
Gregor Michels 8389a18488 monitoring: move prometheus stack onto eae-adp-jump01
to be able to also monitor the new site.

custom grafana dashboard broke while transferring the stack.
will fix next
2022-11-17 00:35:57 +01:00
Gregor Michels 258355170b gw-core01: (adp) allow clients network to route into all other networks 2022-11-16 23:25:59 +01:00
Gregor Michels 74075f307f inventory: add site var 2022-11-16 23:17:07 +01:00
Gregor Michels d4b0e622ef gateways: fw: allow prometheus on backbone 2022-11-16 22:54:37 +01:00
Gregor Michels 2a781ae751 simplify wifi password for the ans backoffice 2022-11-14 02:31:56 +01:00
9 changed files with 84 additions and 110 deletions

View File

@ -83,6 +83,7 @@ wifi_encryption=none
backoffice_wifi_ssid="GU Deutscher Platz Backoffice" backoffice_wifi_ssid="GU Deutscher Platz Backoffice"
backoffice_wifi_encryption=psk2 backoffice_wifi_encryption=psk2
backoffice_wifi_psk="{{ lookup('passwordstore', 'wifi/GU_Deutscher_Platz_Backoffice') }}" backoffice_wifi_psk="{{ lookup('passwordstore', 'wifi/GU_Deutscher_Platz_Backoffice') }}"
site=adp
[site_ans] [site_ans]
ap-b641 ap-b641
@ -96,6 +97,7 @@ ap-b634
ap-b5df ap-b5df
ap-b682 ap-b682
ap-b6cc ap-b6cc
ffl-ans-gw-core01
ffl-ans-sw-distribution01 ffl-ans-sw-distribution01
ffl-ans-sw-access01 ffl-ans-sw-access01
ffl-ans-sw-access02 ffl-ans-sw-access02
@ -108,3 +110,4 @@ backoffice_wifi_ssid="GU Arno-Nitzsche-Strasse BO"
backoffice_wifi_encryption=psk2 backoffice_wifi_encryption=psk2
backoffice_wifi_psk="{{ lookup('passwordstore', 'wifi/GU_Arno-Nitzsche-Straße_Backoffice') }}" backoffice_wifi_psk="{{ lookup('passwordstore', 'wifi/GU_Arno-Nitzsche-Straße_Backoffice') }}"
mgmt_gateway=10.85.1.1 mgmt_gateway=10.85.1.1
site=ans

View File

@ -21,10 +21,36 @@ groups:
description: "The uptime of a node changed in the last two hours. VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "The uptime of a node changed in the last two hours. VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PublicWifiUpstreamLost - alert: PublicWifiUpstreamLost
expr: sum(probe_success{job="e2e_clients_v4"}) == 0 expr: sum(probe_success{job="e2e_adp_clients_v4"}) == 0
for: 0m for: 0m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: The public wifi lost its ability to route into the internet summary: The public wifi lost its ability to route into the internet
description: "check the vpn connection" description: "check the vpn connection"
- name: ServerSpecific
rules:
# https://awesome-prometheus-alerts.grep.to/rules#rule-host-and-hardware-1-7
#
# Please add ignored mountpoints in node_exporter parameters like
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
- alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
for: 2m
labels:
severity: warning
annotations:
summary: Host out of disk space (instance {{ $labels.instance }})
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# https://awesome-prometheus-alerts.grep.to/rules#rule-host-and-hardware-1-9
- alert: HostOutOfInodes
expr: node_filesystem_files_free / node_filesystem_files * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
for: 2m
labels:
severity: warning
annotations:
summary: Host out of inodes (instance {{ $labels.instance }})
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View File

@ -55,53 +55,28 @@
- name: provision monitoring - name: provision monitoring
hosts: hosts:
- monitoring01 - eae-adp-jump01
tasks: tasks:
- name: install playbook requirements
package:
name:
- gpg
- name: install prometheus stack - name: install prometheus stack
package: package:
name: name:
- prometheus - prometheus
- prometheus-alertmanager - alertmanager
- grafana
# stolen from usr/share/prometheus/alertmanager/generate-ui.sh
# script calls apt without "-y" therefore we need to install them beforehand
- name: install dependencies for alertmanager ui generation
package:
name:
- libjs-bootstrap4
- fonts-font-awesome
- curl
- uglifyjs
- golang-github-prometheus-alertmanager-dev
- name: configure alertmanager - name: configure alertmanager
template: template:
src: templates/alertmanager.yml.j2 src: templates/alertmanager.yml.j2
dest: /etc/prometheus/alertmanager.yml dest: /etc/alertmanager/alertmanager.yml
validate: "/usr/bin/amtool check-config %s" validate: "/usr/local/bin/amtool check-config %s"
notify: notify:
- reload prometheus-alertmanager - reload alertmanager
- name: generate alertmanager ui
shell:
cmd: /usr/share/prometheus/alertmanager/generate-ui.sh
creates: "/usr/share/prometheus/alertmanager/ui/index.html"
notify:
- restart prometheus-alertmanager
- name: configure prometheus alerting rules - name: configure prometheus alerting rules
copy: copy:
src: files/alerting_rules.yml src: files/alerting_rules.yml
dest: /etc/prometheus/alerting_rules.yml dest: /etc/prometheus/alerting_rules.yml
owner: root validate: "/usr/local/bin/promtool check rules %s"
group: root
mode: 0644
validate: "/usr/bin/promtool check rules %s"
notify: notify:
- reload prometheus - reload prometheus
@ -109,32 +84,13 @@
template: template:
src: templates/prometheus.yml src: templates/prometheus.yml
dest: /etc/prometheus/prometheus.yml dest: /etc/prometheus/prometheus.yml
validate: "/usr/bin/promtool check config %s" validate: "/usr/local/bin/promtool check config %s"
notify: notify:
- reload prometheus - reload prometheus
- name: add grafana oss repo gpg key
apt_key:
url: "https://packages.grafana.com/gpg.key"
id: "4E40DDF6D76E284A4A6780E48C8C34C524098CB6"
- name: add grafana oss repo
apt_repository:
repo: "deb https://packages.grafana.com/oss/deb stable main"
- name: install grafana oss
package:
name: grafana
- name: enable and start grafana
service:
name: grafana-server
state: started
enabled: yes
- name: enable anonymous login in grafana - name: enable anonymous login in grafana
blockinfile: blockinfile:
path: /etc/grafana/grafana.ini path: /etc/grafana/config.ini
block: | block: |
[auth.anonymous] [auth.anonymous]
enabled = true enabled = true
@ -148,7 +104,7 @@
src: "{{ item }}" src: "{{ item }}"
dest: /etc/grafana/provisioning/datasources/ dest: /etc/grafana/provisioning/datasources/
owner: root owner: root
group: grafana group: _grafana
mode: 0640 mode: 0640
with_fileglob: with_fileglob:
- "templates/grafana/provisioning/datasources/*" - "templates/grafana/provisioning/datasources/*"
@ -160,7 +116,7 @@
path: /etc/grafana/dashboards path: /etc/grafana/dashboards
state: directory state: directory
owner: root owner: root
group: grafana group: _grafana
mode: 0755 mode: 0755
- name: install dashboards - name: install dashboards
@ -168,7 +124,7 @@
src: "{{ item }}" src: "{{ item }}"
dest: /etc/grafana/dashboards/ dest: /etc/grafana/dashboards/
owner: root owner: root
group: grafana group: _grafana
mode: 0640 mode: 0640
with_fileglob: with_fileglob:
- "templates/grafana/dashboards/*" - "templates/grafana/dashboards/*"
@ -178,30 +134,33 @@
src: "{{ item }}" src: "{{ item }}"
dest: /etc/grafana/provisioning/dashboards/ dest: /etc/grafana/provisioning/dashboards/
owner: root owner: root
group: grafana group: _grafana
mode: 0644 mode: 0644
with_fileglob: with_fileglob:
- "templates/grafana/provisioning/dashboards/*" - "templates/grafana/provisioning/dashboards/*"
notify: notify:
- restart grafana - restart grafana
- name: enable and start monitoring stack
service:
name: "{{ item }}"
enabled: true
state: started
with_items:
- prometheus
- alertmanager
- grafana
handlers: handlers:
- name: reload prometheus - name: reload prometheus
service: shell:
name: prometheus cmd: "kill -SIGHUP $(pgrep prometheus)"
state: reloaded
- name: reload prometheus-alertmanager - name: reload alertmanager
service: shell:
name: prometheus-alertmanager cmd: "kill -SIGHUP $(pgrep alertmanager)"
state: reloaded
- name: restart prometheus-alertmanager
service:
name: prometheus-alertmanager
state: restarted
- name: restart grafana - name: restart grafana
service: service:
name: grafana-server name: grafana
state: restarted state: restarted

View File

@ -4,7 +4,7 @@
global: global:
# The smarthost and SMTP sender used for mail notifications. # The smarthost and SMTP sender used for mail notifications.
smtp_smarthost: 'harald.brainpeach.de:587' smtp_smarthost: 'harald.brainpeach.de:587'
smtp_from: 'ffl-eae-adp-mon01@brainpeach.de' smtp_from: 'ffl-eae-adp-jump01@brainpeach.de'
smtp_auth_username: 'ffl-eae-adp-mon01@brainpeach.de' smtp_auth_username: 'ffl-eae-adp-mon01@brainpeach.de'
smtp_auth_password: '{{ lookup("passwordstore", "mailboxes/ffl-eae-adp-mon01@brainpeach.de") }}' smtp_auth_password: '{{ lookup("passwordstore", "mailboxes/ffl-eae-adp-mon01@brainpeach.de") }}'

View File

@ -51,6 +51,13 @@ config rule
option proto ospf option proto ospf
option target ACCEPT option target ACCEPT
config rule
option name From-BACKBONE-Allow-Prometheus
option src backbone
option proto tcp
option dest_port 9100
option target ACCEPT
config rule config rule
option name From-Any-Allow-SSH option name From-Any-Allow-SSH
option src * option src *

View File

@ -63,6 +63,13 @@ config rule
option proto ospf option proto ospf
option target ACCEPT option target ACCEPT
config rule
option name From-BACKBONE-Allow-Prometheus
option src backbone
option proto tcp
option dest_port 9100
option target ACCEPT
config rule config rule
option name From-Any-Allow-SSH option name From-Any-Allow-SSH
option src * option src *

View File

@ -118,7 +118,7 @@ config wireguard_wg1 'mullvad_fr'
config rule config rule
option in 'clients' option in 'clients'
option dest '10.84.1.0/24' option dest '10.0.0.0/8'
option lookup 'main' option lookup 'main'
option priority 49 option priority 49
option disabled '0' option disabled '0'

View File

@ -17,14 +17,21 @@ scrape_configs:
scrape_timeout: 5s scrape_timeout: 5s
static_configs: static_configs:
- targets: ['localhost:9090'] - targets: ['localhost:9090']
- targets: ["{{ hostvars['monitoring01']['ip'] }}:9090"]
{% for group in groups.keys() | difference(['all', 'ungrouped']) %} {% for group in ['accesspoints', 'switches', 'gateways', 'server', 'vms'] %}
- job_name: {{ group }} - job_name: {{ group }}
static_configs: static_configs:
{% for host in groups[group] %} {% for host in groups[group] %}
- targets: ["{{ hostvars[host]['monitoring_ip'] | default(hostvars[host]['ip']) }}:9100"] - targets: ["{{ hostvars[host]['monitoring_ip'] | default(hostvars[host]['ip']) }}:9100"]
labels: labels:
instance: "{{ host }}:9100" instance: "{{ host }}:9100"
{% if hostvars[host]['site'] is defined %}
site: "{{ hostvars[host]['site'] }}"
{% endif %}
{% if hostvars[host]['location'] is defined %}
location: "{{ hostvars[host]['location'] }}"
{% endif %}
{% endfor %} {% endfor %}
{% endfor %} {% endfor %}
@ -33,10 +40,8 @@ scrape_configs:
static_configs: static_configs:
- targets: - targets:
- {{ hostvars['mon-e2e-clients01']['ip'] }}:9115 - {{ hostvars['mon-e2e-clients01']['ip'] }}:9115
- {{ hostvars['mon-e2e-wan01']['ip'] }}:9115
- {{ hostvars['monitoring01']['ip'] }}:9115
- job_name: 'e2e_clients_v4' - job_name: 'e2e_adp_clients_v4'
metrics_path: /probe metrics_path: /probe
params: params:
module: [icmp_v4] module: [icmp_v4]
@ -51,36 +56,3 @@ scrape_configs:
target_label: instance target_label: instance
- target_label: __address__ - target_label: __address__
replacement: {{ hostvars['mon-e2e-clients01']['ip'] }}:9115 replacement: {{ hostvars['mon-e2e-clients01']['ip'] }}:9115
- job_name: 'e2e_default_v4'
metrics_path: /probe
params:
module: [icmp_v4]
static_configs:
- targets:
- 192.168.0.1 # gigacube
- freifunk-leipzig.de
- harald.brainpeach.de
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: {{ hostvars['monitoring01']['ip'] }}:9115
- job_name: 'e2e_wan_v4'
metrics_path: /probe
params:
module: [icmp_v4]
static_configs:
- targets:
- freifunk-leipzig.de
- harald.brainpeach.de
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: {{ hostvars['mon-e2e-wan01']['ip'] }}:9115