Monitoring: Add scrape down alerts
All checks were successful
/ dev-shell (push) Successful in 43s
/ rust-packages (push) Successful in 49s
/ check (push) Successful in 2m32s
/ terraform-providers (push) Successful in 48s
/ systems (push) Successful in 4m3s

This commit is contained in:
Kaare Hoff Skovgaard 2025-08-01 23:07:48 +02:00
parent af583fcbd4
commit 28f4b34cd1
Signed by: khs
GPG key ID: C7D890804F01E9F0
5 changed files with 78 additions and 32 deletions

View file

@ -19,11 +19,18 @@ in
description = "This should only be configured for the server hosting vault, to allow setting up dependencies in terraform"; description = "This should only be configured for the server hosting vault, to allow setting up dependencies in terraform";
default = "prometheus-mtls"; default = "prometheus-mtls";
}; };
exporters.enabled = lib.mkOption { exporters = {
enabled = lib.mkOption {
type = lib.types.listOf lib.types.str; type = lib.types.listOf lib.types.str;
default = [ ]; default = [ ];
description = "List of config.services.prometheus.exporters.<name> that are enabled. This is not done automatically as I don't know how to do that without triggering removed options warnings."; description = "List of config.services.prometheus.exporters.<name> that are enabled. This is not done automatically as I don't know how to do that without triggering removed options warnings.";
}; };
external = lib.mkOption {
type = lib.types.listOf lib.types.str;
default = [ "node_exporter" ];
description = "Externally managed exporters. This should be the name of the job of that exporter. Adding it to this list will get monitoring of when the exporter goes down working";
};
};
}; };
config = lib.mkIf cfg.enable { config = lib.mkIf cfg.enable {
@ -82,8 +89,7 @@ in
PROMETHEUS_CLIENT_CERT = client_cert; PROMETHEUS_CLIENT_CERT = client_cert;
}; };
}; };
environment.etc = environment.etc = {
{
"alloy/prometheus.alloy" = { "alloy/prometheus.alloy" = {
source = ./prometheus.alloy; source = ./prometheus.alloy;
}; };

View file

@ -181,6 +181,7 @@ in
}; };
users.groups.git = { }; users.groups.git = { };
khscodes.infrastructure.vault-prometheus-sender.exporters.external = [ "forgejo" ];
environment.etc."alloy/forgejo_prometheus.alloy" = { environment.etc."alloy/forgejo_prometheus.alloy" = {
text = '' text = ''
prometheus.scrape "forgejo_exporter" { prometheus.scrape "forgejo_exporter" {

View file

@ -0,0 +1,37 @@
# Renders the "Scraping" Prometheus alert group as a YAML snippet (a string).
#
# Arguments:
#   inputs - flake inputs; every entry of `inputs.self.nixosConfigurations` is inspected.
#   lib    - the nixpkgs library.
#
# Result: a YAML list item describing one alert (`JobDown`) whose expression is the
# `or` of one `absent_over_time(up{...}[2m])` term per (host, job) pair, for every host
# that has `khscodes.infrastructure.vault-prometheus-sender.enable` set. When no such
# pair exists the result is the empty string, so that no rule with an empty (invalid)
# expression is generated.
#
# NOTE(review): the YAML indentation below is the conventional rule-group layout;
# confirm it matches the nesting expected at the place this snippet is interpolated.
{ inputs, lib }:
let
  # PromQL term that yields a series (carrying the `instance` and `job` labels from the
  # equality matchers) when no `up` sample was received for that pair in the last 2 minutes.
  # NOTE(review): this only detects a *missing* `up` series; a target that is still being
  # scraped but reports `up == 0` is not matched here - confirm that case is covered elsewhere.
  absentTerm =
    instance: job: ''absent_over_time(up{instance="${instance}", job="${job}"}[2m])'';

  # Flat list with one term per (host, job) pair. Building a flat list (instead of one
  # pre-joined string per host) means a host with no jobs contributes nothing, rather
  # than an empty operand that would produce `A or  or B`.
  terms = lib.attrsets.foldlAttrs (
    acc: _name: nixos:
    let
      vault-prometheus-sender = nixos.config.khscodes.infrastructure.vault-prometheus-sender;
      instance = nixos.config.khscodes.networking.fqdn;
      # A job listed both as enabled exporter and as external job is only checked once.
      instanceJobs = lib.lists.unique (
        vault-prometheus-sender.exporters.enabled ++ vault-prometheus-sender.exporters.external
      );
    in
    # The job list and fqdn are only evaluated for hosts that actually enable the sender.
    acc ++ (if vault-prometheus-sender.enable then map (absentTerm instance) instanceJobs else [ ])
  ) [ ] inputs.self.nixosConfigurations;

  exprs = lib.strings.concatStringsSep " or " terms;

  group = ''
    - name: Scraping
      rules:
        - alert: JobDown
          expr: >
            ${exprs}
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: "Scrape job {{ $labels.job }} on {{ $labels.instance }} is down"
  '';
in
# An alert rule without an expression is rejected by Prometheus-compatible rule loaders,
# so emit nothing at all when there is nothing to watch.
if terms == [ ] then "" else group

View file

@ -2,6 +2,7 @@
inputs, inputs,
config, config,
pkgs, pkgs,
lib,
... ...
}: }:
let let
@ -126,6 +127,7 @@ in
${builtins.readFile ./alerts/postfix.yaml} ${builtins.readFile ./alerts/postfix.yaml}
${builtins.readFile ./alerts/postgres.yaml} ${builtins.readFile ./alerts/postgres.yaml}
${builtins.readFile ./alerts/systemd.yaml} ${builtins.readFile ./alerts/systemd.yaml}
${import ./alerts/job_up.nix { inherit inputs lib; }}
'' ''
]; ];
}; };