From 16643efe46c8d60c64dc24510707ff3cb0e74792 Mon Sep 17 00:00:00 2001 From: Kaare Hoff Skovgaard Date: Fri, 15 Aug 2025 00:03:20 +0200 Subject: [PATCH] Attempt to bring up email notifications for alerts Email sending works, but because the alerting rules are not Grafana-managed, Grafana apparently does not send notifications for them. --- .../mailserver/accounts.nix | 4 +- .../mx.kaareskovgaard.net/users.nix | 5 ++ .../alerts/http.yaml | 4 +- .../alerts/job_up.nix | 2 +- .../monitoring.kaareskovgaard.net/default.nix | 60 ++++++++++++++++++- 5 files changed, 67 insertions(+), 8 deletions(-) diff --git a/nix/systems/aarch64-linux/mx.kaareskovgaard.net/mailserver/accounts.nix b/nix/systems/aarch64-linux/mx.kaareskovgaard.net/mailserver/accounts.nix index 22515e8..6206ac2 100644 --- a/nix/systems/aarch64-linux/mx.kaareskovgaard.net/mailserver/accounts.nix +++ b/nix/systems/aarch64-linux/mx.kaareskovgaard.net/mailserver/accounts.nix @@ -131,11 +131,11 @@ let ) [ ] cfg.accounts; systemAccountsPassDbTemplateContents = - lib.concatStringsSep "\n" ( + lib.concatStrings ( lib.lists.map (account: '' {{- with secret "mx.kaareskovgaard.net/data/users/${account}" -}} ${account}:{{ .Data.data.hashed_password }}:::::: - {{- end -}} + {{ end -}} '') systemAccounts ) # Just make sure the file is not empty diff --git a/nix/systems/aarch64-linux/mx.kaareskovgaard.net/users.nix b/nix/systems/aarch64-linux/mx.kaareskovgaard.net/users.nix index e5ef876..a0a5b81 100644 --- a/nix/systems/aarch64-linux/mx.kaareskovgaard.net/users.nix +++ b/nix/systems/aarch64-linux/mx.kaareskovgaard.net/users.nix @@ -15,4 +15,9 @@ addresses = [ "git@kas.codes" ]; sendOnly = true; }; + "monitoring" = { + name = "monitoring.kaareskovgaard.net"; + addresses = [ "monitoring@kas.codes" ]; + sendOnly = true; + }; } diff --git a/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/alerts/http.yaml b/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/alerts/http.yaml index a0c6e88..efa369d 100644 --- 
a/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/alerts/http.yaml +++ b/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/alerts/http.yaml @@ -10,8 +10,8 @@ summary: "Nginx on {{ $labels.instance }} is down" - alert: HighHttpErrorRate expr: > - sum by(vhost, instance) (rate(http_response_count_total{status=~"5..",job="nginxlog"}[1m])) / - sum by(vhost, instance) (rate(http_response_count_total{job="nginxlog"}[1m])) + sum by(vhost, instance) (rate(http_response_count_total{status=~"5..",job="nginxlog"}[10m])) / + sum by(vhost, instance) (rate(http_response_count_total{job="nginxlog"}[10m])) > 0 for: 30m labels: diff --git a/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/alerts/job_up.nix b/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/alerts/job_up.nix index 7ce7dac..161972b 100644 --- a/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/alerts/job_up.nix +++ b/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/alerts/job_up.nix @@ -29,7 +29,7 @@ in - alert: JobDown expr: > ${exprs} - for: 10m + for: 1m labels: severity: critical annotations: diff --git a/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/default.nix b/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/default.nix index 6cf40f1..18a6f36 100644 --- a/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/default.nix +++ b/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/default.nix @@ -52,6 +52,7 @@ let ); oauthCredentialFile = config.khscodes.infrastructure.kanidm-client-application.secretFile; + smtpPasswordFile = "/run/secret/grafana/smtp/password"; in { imports = [ @@ -61,6 +62,7 @@ in systemd.services.grafana = { unitConfig.ConditionPathExists = [ oauthCredentialFile + smtpPasswordFile ]; }; services.grafana = { @@ -102,6 +104,16 @@ in org_mapping = "*:*:Admin"; role_attribute_path = "'GrafanaAdmin'"; }; + + smtp = { + enabled = true; + from_name = "monitoring.kaareskovgaard.net"; + from_address = "monitoring@kas.codes"; + ehlo_identity = 
"monitoring.kaareskovgaard.net"; + host = "mx.kaareskovgaard.net:465"; + password = "$__file{${smtpPasswordFile}}"; + user = "monitoring"; + }; }; provision = { enable = true; @@ -156,6 +168,30 @@ in options.path = postfixDashboard; } ]; + alerting = { + contactPoints.settings.contactPoints = [ + { + orgId = 1; + name = "Default"; + receivers = [ + { + uid = "khs"; + type = "email"; + settings = { + addresses = "kaare@kaareskovgaard.net"; + }; + } + ]; + } + ]; + policies.settings.policies = [ + { + orgId = 1; + receiver = "Default"; + group_by = [ "instance" ]; + } + ]; + }; }; }; services.prometheus = { @@ -163,9 +199,10 @@ in listenAddress = "127.0.0.1"; extraFlags = [ "--web.enable-otlp-receiver" - "--storage.tsdb.retention.time=15d" + "--storage.tsdb.retention.time=60d" ]; - # alertmanager.enable = true; + # I think I need to move these into grafana managed rules, in order to get notifications + # working properly. rules = [ '' groups: @@ -227,7 +264,7 @@ in compactor = { retention_enabled = true; compaction_interval = "24h"; - retention_delete_delay = "15d"; + retention_delete_delay = "${builtins.toString (60 * 24)}h"; delete_request_store = "filesystem"; working_directory = "${config.services.loki.dataDir}/retention"; }; @@ -312,6 +349,18 @@ in perms = "0644"; reloadOrRestartUnits = [ "nginx.service" ]; } + { + contents = '' + {{- with secret "mx.kaareskovgaard.net/data/users/monitoring" -}} + {{ .Data.data.password }} + {{- end -}} + ''; + destination = smtpPasswordFile; + owner = "grafana"; + group = "grafana"; + perms = "0600"; + reloadOrRestartUnits = [ "grafana.service" ]; + } ]; infrastructure.kanidm-client-application = { enable = true; @@ -319,6 +368,11 @@ in secretOwner = "grafana"; reloadOrRestartUnits = [ "grafana.service" ]; }; + infrastructure.vault-server-approle.policy = { + "mx.kaareskovgaard.net/data/users/monitoring" = { + capabilities = [ "read" ]; + }; + }; }; khscodes.networking.fqdn = "monitoring.kaareskovgaard.net"; system.stateVersion 
= "25.05";