Move the monitoring stack provisioning from monitoring01 to eae-adp-jump01.

files/alerting_rules.yml
  PublicWifiUpstreamLost evaluates the renamed probe job e2e_adp_clients_v4.

playbook_provision_monitoring.yml
  * The play targets eae-adp-jump01.
  * prometheus, alertmanager and grafana are installed by one package task;
    the gpg / grafana apt repository tasks and the alertmanager ui
    generation tasks are removed.
  * amtool and promtool are called from /usr/local/bin, the alertmanager
    config is written to /etc/alertmanager/, grafana is configured through
    /etc/grafana/config.ini and its files belong to group _grafana.
  * One looped task enables and starts prometheus, alertmanager and grafana.
  * The reload handlers send SIGHUP with "pkill -HUP -x <name>". Prometheus
    and Alertmanager re-read their configuration on SIGHUP. -x limits the
    match to the exact process name, -HUP is the POSIX spelling of the
    signal, and no shell is needed, so the command module is used.

templates/alertmanager.yml.j2
  smtp_from uses the jump01 address. smtp_auth_username and the password
  lookup still use the mon01 mailbox.
  NOTE(review): confirm the smarthost accepts a sender that differs from
  the authenticated user.

templates/prometheus.yml
  * Prometheus additionally scrapes monitoring01 on port 9090.
  * Node jobs are generated for a fixed list of inventory groups and carry
    a "site" label when the host defines one.
  * Only the mon-e2e-clients01 blackbox exporter remains; the
    e2e_default_v4 and e2e_wan_v4 jobs are dropped and e2e_clients_v4 is
    renamed to e2e_adp_clients_v4.

NOTE(review): leading indentation of the hunk lines was restored from the
YAML nesting; compare with the target tree before applying. The post-image
blob id on the playbook index line predates the pkill handler change.

diff --git a/files/alerting_rules.yml b/files/alerting_rules.yml
index a3eb46e..494fc2d 100644
--- a/files/alerting_rules.yml
+++ b/files/alerting_rules.yml
@@ -21,7 +21,7 @@ groups:
           description: "The uptime of a node changed in the last two hours. VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
       - alert: PublicWifiUpstreamLost
-        expr: sum(probe_success{job="e2e_clients_v4"}) == 0
+        expr: sum(probe_success{job="e2e_adp_clients_v4"}) == 0
         for: 0m
         labels:
           severity: critical
diff --git a/playbook_provision_monitoring.yml b/playbook_provision_monitoring.yml
index 59fc5b8..3f2d489 100644
--- a/playbook_provision_monitoring.yml
+++ b/playbook_provision_monitoring.yml
@@ -55,53 +55,28 @@
 
 - name: provision monitoring
   hosts:
-    - monitoring01
+    - eae-adp-jump01
   tasks:
-    - name: install playbook requirements
-      package:
-        name:
-          - gpg
-
     - name: install prometheus stack
       package:
         name:
           - prometheus
-          - prometheus-alertmanager
-
-    # stolen from usr/share/prometheus/alertmanager/generate-ui.sh
-    # script calls apt without "-y" therefore we need to install them beforehand
-    - name: install dependencies for alertmanager ui generation
-      package:
-        name:
-          - libjs-bootstrap4
-          - fonts-font-awesome
-          - curl
-          - uglifyjs
-          - golang-github-prometheus-alertmanager-dev
+          - alertmanager
+          - grafana
 
     - name: configure alertmanager
       template:
         src: templates/alertmanager.yml.j2
-        dest: /etc/prometheus/alertmanager.yml
-        validate: "/usr/bin/amtool check-config %s"
+        dest: /etc/alertmanager/alertmanager.yml
+        validate: "/usr/local/bin/amtool check-config %s"
       notify:
-        - reload prometheus-alertmanager
-
-    - name: generate alertmanager ui
-      shell:
-        cmd: /usr/share/prometheus/alertmanager/generate-ui.sh
-        creates: "/usr/share/prometheus/alertmanager/ui/index.html"
-      notify:
-        - restart prometheus-alertmanager
+        - reload alertmanager
 
     - name: configure prometheus alerting rules
       copy:
         src: files/alerting_rules.yml
         dest: /etc/prometheus/alerting_rules.yml
-        owner: root
-        group: root
-        mode: 0644
-        validate: "/usr/bin/promtool check rules %s"
+        validate: "/usr/local/bin/promtool check rules %s"
       notify:
         - reload prometheus
 
@@ -109,32 +84,13 @@
       template:
         src: templates/prometheus.yml
         dest: /etc/prometheus/prometheus.yml
-        validate: "/usr/bin/promtool check config %s"
+        validate: "/usr/local/bin/promtool check config %s"
       notify:
         - reload prometheus
 
-    - name: add grafana oss repo gpg key
-      apt_key:
-        url: "https://packages.grafana.com/gpg.key"
-        id: "4E40DDF6D76E284A4A6780E48C8C34C524098CB6"
-
-    - name: add grafana oss repo
-      apt_repository:
-        repo: "deb https://packages.grafana.com/oss/deb stable main"
-
-    - name: install grafana oss
-      package:
-        name: grafana
-
-    - name: enable and start grafana
-      service:
-        name: grafana-server
-        state: started
-        enabled: yes
-
     - name: enable anonymous login in grafana
       blockinfile:
-        path: /etc/grafana/grafana.ini
+        path: /etc/grafana/config.ini
         block: |
           [auth.anonymous]
           enabled = true
@@ -148,7 +104,7 @@
         src: "{{ item }}"
         dest: /etc/grafana/provisioning/datasources/
         owner: root
-        group: grafana
+        group: _grafana
         mode: 0640
       with_fileglob:
         - "templates/grafana/provisioning/datasources/*"
@@ -160,7 +116,7 @@
         path: /etc/grafana/dashboards
         state: directory
         owner: root
-        group: grafana
+        group: _grafana
         mode: 0755
 
     - name: install dashboards
@@ -168,7 +124,7 @@
         src: "{{ item }}"
         dest: /etc/grafana/dashboards/
         owner: root
-        group: grafana
+        group: _grafana
         mode: 0640
       with_fileglob:
         - "templates/grafana/dashboards/*"
@@ -178,30 +134,33 @@
         src: "{{ item }}"
         dest: /etc/grafana/provisioning/dashboards/
         owner: root
-        group: grafana
+        group: _grafana
         mode: 0644
       with_fileglob:
         - "templates/grafana/provisioning/dashboards/*"
       notify:
         - restart grafana
 
+    - name: enable and start monitoring stack
+      service:
+        name: "{{ item }}"
+        enabled: true
+        state: started
+      with_items:
+        - prometheus
+        - alertmanager
+        - grafana
+
   handlers:
     - name: reload prometheus
-      service:
-        name: prometheus
-        state: reloaded
+      command:
+        cmd: "pkill -HUP -x prometheus"
 
-    - name: reload prometheus-alertmanager
-      service:
-        name: prometheus-alertmanager
-        state: reloaded
-
-    - name: restart prometheus-alertmanager
-      service:
-        name: prometheus-alertmanager
-        state: restarted
+    - name: reload alertmanager
+      command:
+        cmd: "pkill -HUP -x alertmanager"
 
     - name: restart grafana
       service:
-        name: grafana-server
+        name: grafana
         state: restarted
diff --git a/templates/alertmanager.yml.j2 b/templates/alertmanager.yml.j2
index 910144f..497471e 100644
--- a/templates/alertmanager.yml.j2
+++ b/templates/alertmanager.yml.j2
@@ -4,7 +4,7 @@
 global:
   # The smarthost and SMTP sender used for mail notifications.
   smtp_smarthost: 'harald.brainpeach.de:587'
-  smtp_from: 'ffl-eae-adp-mon01@brainpeach.de'
+  smtp_from: 'ffl-eae-adp-jump01@brainpeach.de'
   smtp_auth_username: 'ffl-eae-adp-mon01@brainpeach.de'
   smtp_auth_password: '{{ lookup("passwordstore", "mailboxes/ffl-eae-adp-mon01@brainpeach.de") }}'
 
diff --git a/templates/prometheus.yml b/templates/prometheus.yml
index 03ca1ac..2b42b51 100644
--- a/templates/prometheus.yml
+++ b/templates/prometheus.yml
@@ -17,14 +17,18 @@ scrape_configs:
     scrape_timeout: 5s
     static_configs:
       - targets: ['localhost:9090']
+      - targets: ["{{ hostvars['monitoring01']['ip'] }}:9090"]
 
-{% for group in groups.keys() | difference(['all', 'ungrouped']) %}
+{% for group in ['accesspoints', 'switches', 'gateways', 'server', 'vms'] %}
   - job_name: {{ group }}
     static_configs:
 {% for host in groups[group] %}
       - targets: ["{{ hostvars[host]['monitoring_ip'] | default(hostvars[host]['ip']) }}:9100"]
         labels:
           instance: "{{ host }}:9100"
+{% if hostvars[host]['site'] is defined %}
+          site: "{{ hostvars[host]['site'] }}"
+{% endif %}
 {% endfor %}
 {% endfor %}
 
@@ -33,10 +37,8 @@ scrape_configs:
     static_configs:
       - targets:
           - {{ hostvars['mon-e2e-clients01']['ip'] }}:9115
-          - {{ hostvars['mon-e2e-wan01']['ip'] }}:9115
-          - {{ hostvars['monitoring01']['ip'] }}:9115
 
-  - job_name: 'e2e_clients_v4'
+  - job_name: 'e2e_adp_clients_v4'
     metrics_path: /probe
     params:
       module: [icmp_v4]
@@ -51,36 +53,3 @@ scrape_configs:
         target_label: instance
       - target_label: __address__
         replacement: {{ hostvars['mon-e2e-clients01']['ip'] }}:9115
-
-  - job_name: 'e2e_default_v4'
-    metrics_path: /probe
-    params:
-      module: [icmp_v4]
-    static_configs:
-      - targets:
-          - 192.168.0.1 # gigacube
-          - freifunk-leipzig.de
-          - harald.brainpeach.de
-    relabel_configs:
-      - source_labels: [__address__]
-        target_label: __param_target
-      - source_labels: [__param_target]
-        target_label: instance
-      - target_label: __address__
-        replacement: {{ hostvars['monitoring01']['ip'] }}:9115
-
-  - job_name: 'e2e_wan_v4'
-    metrics_path: /probe
-    params:
-      module: [icmp_v4]
-    static_configs:
-      - targets:
-          - freifunk-leipzig.de
-          - harald.brainpeach.de
-    relabel_configs:
-      - source_labels: [__address__]
-        target_label: __param_target
-      - source_labels: [__param_target]
-        target_label: instance
-      - target_label: __address__
-        replacement: {{ hostvars['mon-e2e-wan01']['ip'] }}:9115