Monitoring: Add scrape down alerts
All checks were successful
/ dev-shell (push) Successful in 43s
/ rust-packages (push) Successful in 49s
/ check (push) Successful in 2m32s
/ terraform-providers (push) Successful in 48s
/ systems (push) Successful in 4m3s

This commit is contained in:
Kaare Hoff Skovgaard 2025-08-01 23:07:48 +02:00
parent af583fcbd4
commit 28f4b34cd1
Signed by: khs
GPG key ID: C7D890804F01E9F0
5 changed files with 78 additions and 32 deletions

View file

@ -19,10 +19,17 @@ in
description = "This should only be configured for the server hosting vault, to allow setting up dependencies in terraform";
default = "prometheus-mtls";
};
exporters.enabled = lib.mkOption {
type = lib.types.listOf lib.types.str;
default = [ ];
description = "List of config.services.prometheus.exporters.<name> that are enabled. This is not done automatically as I don't know how to do that without triggering removed options warnings.";
exporters = {
enabled = lib.mkOption {
type = lib.types.listOf lib.types.str;
default = [ ];
description = "List of config.services.prometheus.exporters.<name> that are enabled. This is not done automatically as I don't know how to do that without triggering removed options warnings.";
};
external = lib.mkOption {
type = lib.types.listOf lib.types.str;
default = [ "node_exporter" ];
description = "Externally managed exporters. This should be the name of the job of that exporter. Adding it to this list will get monitoring of when the exporter goes down working";
};
};
};
@ -82,35 +89,34 @@ in
PROMETHEUS_CLIENT_CERT = client_cert;
};
};
environment.etc =
{
"alloy/prometheus.alloy" = {
source = ./prometheus.alloy;
};
}
// lib.listToAttrs (
lib.lists.map (
name:
let
value = config.services.prometheus.exporters.${name};
in
{
name = "alloy/prometheus_${name}.alloy";
value = {
text = ''
prometheus.scrape "exporter_${name}" {
targets = [
{"__address__" = "127.0.0.1:${toString value.port}", "instance" = constants.hostname, "job" = "${name}"},
]
environment.etc = {
"alloy/prometheus.alloy" = {
source = ./prometheus.alloy;
};
}
// lib.listToAttrs (
lib.lists.map (
name:
let
value = config.services.prometheus.exporters.${name};
in
{
name = "alloy/prometheus_${name}.alloy";
value = {
text = ''
prometheus.scrape "exporter_${name}" {
targets = [
{"__address__" = "127.0.0.1:${toString value.port}", "instance" = constants.hostname, "job" = "${name}"},
]
scrape_interval = "1m"
scrape_interval = "1m"
forward_to = [otelcol.receiver.prometheus.default.receiver]
}
'';
};
}
) cfg.exporters.enabled
);
forward_to = [otelcol.receiver.prometheus.default.receiver]
}
'';
};
}
) cfg.exporters.enabled
);
};
}

View file

@ -181,6 +181,7 @@ in
};
users.groups.git = { };
khscodes.infrastructure.vault-prometheus-sender.exporters.external = [ "forgejo" ];
environment.etc."alloy/forgejo_prometheus.alloy" = {
text = ''
prometheus.scrape "forgejo_exporter" {

View file

@ -0,0 +1,37 @@
# Generates a Prometheus alerting rule group (YAML fragment) that fires when
# any configured scrape job on any NixOS host stops producing `up` samples.
#
# Arguments:
#   inputs — the flake inputs; `inputs.self.nixosConfigurations` is folded over.
#   lib    — nixpkgs lib.
#
# Returns: a YAML string to be spliced into the Prometheus rule file, or the
# empty string when no host contributes any job expression.
{ inputs, lib }:
let
  # One PromQL expression per monitored host: fires when any of the host's
  # exporter jobs has been absent from `up` for the 2m lookback window.
  jobs = lib.attrsets.foldlAttrs (
    acc: name: nixos:
    let
      cfg = nixos.config.khscodes.infrastructure.vault-prometheus-sender;
      instance = nixos.config.khscodes.networking.fqdn;
      instanceJobs = cfg.exporters.enabled ++ cfg.exporters.external;
      expr = lib.strings.concatMapStringsSep " or " (
        job: ''absent_over_time(up{instance="${instance}", job="${job}"}[2m])''
      ) instanceJobs;
    in
    # Skip hosts without monitoring enabled, and hosts declaring no exporters
    # at all — an empty expr would corrupt the `or`-joined PromQL below.
    acc ++ lib.lists.optional (cfg.enable && instanceJobs != [ ]) expr
  ) [ ] inputs.self.nixosConfigurations;
  exprs = lib.strings.concatStringsSep " or " jobs;
in
# Emit nothing when no host contributes an expression; a rule with an empty
# `expr:` field would make the whole rule file invalid.
lib.strings.optionalString (jobs != [ ]) ''
  - name: Scraping
    rules:
      - alert: JobDown
        expr: >
          ${exprs}
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "Scrape job {{ $labels.job }} on {{ $labels.instance }} is down"
''

View file

@ -2,6 +2,7 @@
inputs,
config,
pkgs,
lib,
...
}:
let
@ -126,6 +127,7 @@ in
${builtins.readFile ./alerts/postfix.yaml}
${builtins.readFile ./alerts/postgres.yaml}
${builtins.readFile ./alerts/systemd.yaml}
${import ./alerts/job_up.nix { inherit inputs lib; }}
''
];
};