From 554761c118add624c45b49f8236d082ad43e6ea5 Mon Sep 17 00:00:00 2001 From: Kaare Hoff Skovgaard Date: Thu, 7 Aug 2025 22:49:49 +0200 Subject: [PATCH] Add some ZFS alerting rules to prometheus --- nix/modules/nixos/fs/zfs/default.nix | 3 +++ .../alerts/systemd.yaml | 2 +- .../alerts/zfs.yaml | 26 +++++++++++++++++++ .../monitoring.kaareskovgaard.net/default.nix | 1 + 4 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/alerts/zfs.yaml diff --git a/nix/modules/nixos/fs/zfs/default.nix b/nix/modules/nixos/fs/zfs/default.nix index 3ce4b0b..bf4e59d 100644 --- a/nix/modules/nixos/fs/zfs/default.nix +++ b/nix/modules/nixos/fs/zfs/default.nix @@ -138,6 +138,7 @@ in forceImportRoot = false; requestEncryptionCredentials = false; }; + services.zfs.autoScrub.enable = true; systemd.services.zfs-mount.enable = false; systemd.services.zfs-import-zroot.enable = false; systemd.services.khscodes-zpool-setup = { @@ -197,5 +198,7 @@ in restartUnits = [ "khscodes-zpool-setup.service" ]; } ]; + services.prometheus.exporters.zfs.enable = true; + khscodes.infrastructure.vault-prometheus-sender.exporters.enabled = [ "zfs" ]; }; } diff --git a/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/alerts/systemd.yaml b/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/alerts/systemd.yaml index e2910ae..c0e1f05 100644 --- a/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/alerts/systemd.yaml +++ b/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/alerts/systemd.yaml @@ -7,4 +7,4 @@ labels: severity: warn annotations: - summary: "Unit {{ $labels.name}} on {{ $labels.instance }} is in failed state" + summary: "Unit {{ $labels.name }} on {{ $labels.instance }} is in failed state" diff --git a/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/alerts/zfs.yaml b/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/alerts/zfs.yaml new file mode 100644 index 0000000..a29245f --- /dev/null +++ b/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/alerts/zfs.yaml @@ -0,0 +1,26 @@ +- name: ZFS + rules: + - alert: ScrapeFailed + expr: > + zfs_scrape_collector_success{job="zfs"} == 0 + for: 10m + labels: + severity: warn + annotations: + summary: "Could not scrape zfs metrics on {{ $labels.instance }}" + - alert: PoolNotOnline + expr: > + zfs_pool_health{job="zfs"} != 0 + for: 10m + labels: + severity: critical + annotations: + summary: "Pool {{ $labels.pool }} on {{ $labels.instance }} is not ONLINE." + - alert: PoolReadOnly + expr: > + zfs_pool_readonly{job="zfs"} == 1 + for: 10m + labels: + severity: critical + annotations: + summary: "Pool {{ $labels.pool }} on {{ $labels.instance }} is in read only mode." diff --git a/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/default.nix b/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/default.nix index 4e6b0e2..cbe142a 100644 --- a/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/default.nix +++ b/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/default.nix @@ -171,6 +171,7 @@ in ${builtins.readFile ./alerts/postgres.yaml} ${builtins.readFile ./alerts/systemd.yaml} ${builtins.readFile ./alerts/dovecot.yaml} + ${builtins.readFile ./alerts/zfs.yaml} ${import ./alerts/job_up.nix { inherit inputs lib; }} '' ];