From 619984cd89502a879d9b260c30e8d2c1d3a41830 Mon Sep 17 00:00:00 2001
From: Kaare Hoff Skovgaard
Date: Sun, 17 Aug 2025 00:42:33 +0200
Subject: [PATCH] Get email alerting working

---
 .../mailserver/default.nix                    |  2 +-
 .../alerts/http.yaml                          |  4 +-
 .../monitoring.kaareskovgaard.net/default.nix | 99 +++++++++++--------
 3 files changed, 63 insertions(+), 42 deletions(-)

diff --git a/nix/systems/aarch64-linux/mx.kaareskovgaard.net/mailserver/default.nix b/nix/systems/aarch64-linux/mx.kaareskovgaard.net/mailserver/default.nix
index 0f07544..3752684 100644
--- a/nix/systems/aarch64-linux/mx.kaareskovgaard.net/mailserver/default.nix
+++ b/nix/systems/aarch64-linux/mx.kaareskovgaard.net/mailserver/default.nix
@@ -90,7 +90,7 @@ in
     enable = true;
     enableImap = false;
     enableImapSsl = true;
-    enableSubmission = false;
+    enableSubmission = true;
     enableSubmissionSsl = true;
     fqdn = config.khscodes.networking.fqdn;
     useUTF8FolderNames = true;
diff --git a/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/alerts/http.yaml b/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/alerts/http.yaml
index efa369d..3c7f71a 100644
--- a/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/alerts/http.yaml
+++ b/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/alerts/http.yaml
@@ -12,9 +12,9 @@
     expr: >
      sum by(vhost, instance) (rate(http_response_count_total{status=~"5..",job="nginxlog"}[10m]))
      / sum by(vhost, instance) (rate(http_response_count_total{job="nginxlog"}[10m]))
-      > 0
+      > 0.05
     for: 30m
     labels:
      severity: critical
     annotations:
-      summary: "Nginx VHost {{ $labels.vhost }} on {{ $labels.instance }} is generating many internal server errors over 30 minutes"
+      summary: "Nginx VHost {{ $labels.vhost }} on {{ $labels.instance }} is generating over 5% internal server errors over 30 minutes"
diff --git a/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/default.nix b/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/default.nix
index a73190c..8ecec93 100644
--- a/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/default.nix
+++ b/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/default.nix
@@ -62,7 +62,6 @@ in
   systemd.services.grafana = {
     unitConfig.ConditionPathExists = [
       oauthCredentialFile
-      smtpPasswordFile
     ];
   };
   services.grafana = {
@@ -104,16 +103,6 @@ in
         org_mapping = "*:*:Admin";
         role_attribute_path = "'GrafanaAdmin'";
       };
-
-      smtp = {
-        enabled = true;
-        from_name = "monitoring.kaareskovgaard.net";
-        from_address = "monitoring@kas.codes";
-        ehlo_identity = "monitoring.kaareskovgaard.net";
-        host = "mx.kaareskovgaard.net:465";
-        password = "$__file{${smtpPasswordFile}}";
-        user = "monitoring";
-      };
     };
     provision = {
       enable = true;
@@ -145,6 +134,15 @@ in
           manageAlerts = true;
         };
       }
+      {
+        name = "Alertmanager";
+        type = "alertmanager";
+        url = "http://127.0.0.1:${toString config.services.prometheus.alertmanager.port}";
+        jsonData = {
+          implementation = "prometheus";
+          handleGrafanaManagedAlerts = true;
+        };
+      }
     ];
     dashboards.settings.providers = [
       {
@@ -168,39 +166,55 @@ in
          options.path = postfixDashboard;
        }
      ];
-      alerting = {
-        contactPoints.settings.contactPoints = [
-          {
-            orgId = 1;
-            name = "grafana-default-email";
-            receivers = [
-              {
-                uid = "khs";
-                type = "email";
-                settings = {
-                  addresses = "kaare@kaareskovgaard.net";
-                };
-              }
-            ];
-          }
-        ];
-        policies.settings.policies = [
-          {
-            orgId = 1;
-            receiver = "grafana-default-email";
-            group_by = [ "instance" ];
-          }
-        ];
-      };
    };
  };
  services.prometheus = {
    enable = true;
    listenAddress = "127.0.0.1";
+    alertmanagers = [
+      {
+        static_configs = [
+          {
+            targets = [ "localhost:${toString config.services.prometheus.alertmanager.port}" ];
+          }
+        ];
+      }
+    ];
    extraFlags = [
      "--web.enable-otlp-receiver"
      "--storage.tsdb.retention.time=60d"
    ];
+    alertmanager = {
+      enable = true;
+      environmentFile = smtpPasswordFile;
+      logLevel = "info";
+      configuration = {
+        global = {
+          smtp_from = "monitoring@kas.codes";
+          smtp_smarthost = "mx.kaareskovgaard.net:587";
+          smtp_hello = "monitoring.kaareskovgaard.net";
+          smtp_auth_username = "monitoring";
+          smtp_auth_password = "$SMTP_PASSWORD";
+        };
+        route = {
+          receiver = "email";
+          group_by = [
+            "instance"
+          ];
+        };
+        receivers = [
+          {
+            name = "email";
+            email_configs = [
+              {
+                send_resolved = true;
+                to = "kaare@kaareskovgaard.net";
+              }
+            ];
+          }
+        ];
+      };
+    };
    # I think I need to move these into grafana managed rules, in order to get notifications
    # working properly.
    rules = [
@@ -217,6 +231,11 @@ in
      ''
    ];
  };
+  systemd.services.alertmanager = {
+    unitConfig = {
+      ConditionPathExists = [ smtpPasswordFile ];
+    };
+  };
  services.loki = {
    enable = true;
    configuration = {
@@ -306,6 +325,7 @@ in
        verify = "on";
        certificate = "/etc/loki/client-signer.pem";
      };
+      rateLimit.enable = false;
      locations."/" = {
        proxyPass = "http://${loki.configuration.server.http_listen_address}:${toString loki.configuration.server.http_listen_port}";
        proxyWebsockets = true;
@@ -317,6 +337,7 @@ in
        verify = "on";
        certificate = "/etc/prometheus/client-signer.pem";
      };
+      rateLimit.enable = false;
      locations."/" = {
        proxyPass = "http://${prometheus.listenAddress}:${toString prometheus.port}";
        proxyWebsockets = true;
@@ -352,14 +373,14 @@ in
    {
      contents = ''
        {{- with secret "mx.kaareskovgaard.net/data/users/monitoring" -}}
-        {{ .Data.data.password }}
+        SMTP_PASSWORD={{ .Data.data.password }}
        {{- end -}}
      '';
      destination = smtpPasswordFile;
-      owner = "grafana";
-      group = "grafana";
+      owner = "root";
+      group = "root";
      perms = "0600";
-      reloadOrRestartUnits = [ "grafana.service" ];
+      reloadOrRestartUnits = [ "alertmanager.service" ];
    }
  ];
  infrastructure.kanidm-client-application = {