Add some ZFS alerting rules to prometheus
Some checks failed
/ rust-packages (push) Failing after 41s
/ check (push) Failing after 47s
/ dev-shell (push) Successful in 49s
/ systems (push) Successful in 4m11s
/ terraform-providers (push) Successful in 1m25s

This commit is contained in:
Kaare Hoff Skovgaard 2025-08-07 22:49:49 +02:00
parent 5abaa9322e
commit 554761c118
Signed by: khs
GPG key ID: C7D890804F01E9F0
4 changed files with 31 additions and 1 deletions

View file

@ -138,6 +138,7 @@ in
forceImportRoot = false; forceImportRoot = false;
requestEncryptionCredentials = false; requestEncryptionCredentials = false;
}; };
services.zfs.autoScrub.enable = true;
systemd.services.zfs-mount.enable = false; systemd.services.zfs-mount.enable = false;
systemd.services.zfs-import-zroot.enable = false; systemd.services.zfs-import-zroot.enable = false;
systemd.services.khscodes-zpool-setup = { systemd.services.khscodes-zpool-setup = {
@ -197,5 +198,7 @@ in
restartUnits = [ "khscodes-zpool-setup.service" ]; restartUnits = [ "khscodes-zpool-setup.service" ];
} }
]; ];
services.prometheus.exporters.zfs.enable = true;
khscodes.infrastructure.vault-prometheus-sender.exporters.enabled = [ "zfs" ];
}; };
} }

View file

@ -0,0 +1,26 @@
# Prometheus alerting rule group "ZFS": three alerts over series selected with job="zfs".
# Every alert must hold for 10 minutes (for: 10m) before it fires.
# NOTE(review): indentation is not visible in this view; confirm the nesting of
# rules/labels/annotations against the real file before editing.
# NOTE(review): presumably the job="zfs" series come from the zfs exporter — confirm
# that the scrape job is actually named "zfs".
- name: ZFS
rules:
# Warn when zfs_scrape_collector_success is 0, i.e. the rule treats a zero value
# as a failed collection of ZFS metrics on the instance.
- alert: ScrapeFailed
expr: >
zfs_scrape_collector_success{job="zfs"} == 0
for: 10m
labels:
severity: warn
annotations:
summary: "Could not scrape zfs metrics on {{ $labels.instance }}"
# Critical when zfs_pool_health is anything other than 0; the rule treats 0 as the
# ONLINE state, so any non-zero value is reported as "not ONLINE".
- alert: PoolNotOnline
expr: >
zfs_pool_health{job="zfs"} != 0
for: 10m
labels:
severity: critical
annotations:
summary: "Pool {{ $labels.pool }} on {{ $labels.instance }} is not ONLINE."
# Critical when zfs_pool_readonly is 1, which the rule reports as the pool being
# in read only mode.
- alert: PoolReadOnly
expr: >
zfs_pool_readonly{job="zfs"} == 1
for: 10m
labels:
severity: critical
annotations:
summary: "Pool {{ $labels.pool }} on {{ $labels.instance }} is in read only mode."

View file

@ -171,6 +171,7 @@ in
${builtins.readFile ./alerts/postgres.yaml} ${builtins.readFile ./alerts/postgres.yaml}
${builtins.readFile ./alerts/systemd.yaml} ${builtins.readFile ./alerts/systemd.yaml}
${builtins.readFile ./alerts/dovecot.yaml} ${builtins.readFile ./alerts/dovecot.yaml}
${builtins.readFile ./alerts/zfs.yaml}
${import ./alerts/job_up.nix { inherit inputs lib; }} ${import ./alerts/job_up.nix { inherit inputs lib; }}
'' ''
]; ];