Monitoring: Add scrape down alerts
This commit is contained in:
parent
af583fcbd4
commit
28f4b34cd1
5 changed files with 78 additions and 32 deletions
|
@ -19,10 +19,17 @@ in
|
||||||
description = "This should only be configured for the server hosting vault, to allow setting up dependencies in terraform";
|
description = "This should only be configured for the server hosting vault, to allow setting up dependencies in terraform";
|
||||||
default = "prometheus-mtls";
|
default = "prometheus-mtls";
|
||||||
};
|
};
|
||||||
exporters.enabled = lib.mkOption {
|
exporters = {
|
||||||
type = lib.types.listOf lib.types.str;
|
enabled = lib.mkOption {
|
||||||
default = [ ];
|
type = lib.types.listOf lib.types.str;
|
||||||
description = "List of config.services.prometheus.exporters.<name> that are enabled. This is not done automatically as I don't know how to do that without triggering removed options warnings.";
|
default = [ ];
|
||||||
|
description = "List of config.services.prometheus.exporters.<name> that are enabled. This is not done automatically as I don't know how to do that without triggering removed options warnings.";
|
||||||
|
};
|
||||||
|
external = lib.mkOption {
|
||||||
|
type = lib.types.listOf lib.types.str;
|
||||||
|
default = [ "node_exporter" ];
|
||||||
|
description = "Externally managed exporters. This should be the name of the job of that exporter. Adding it to this list will get monitoring of when the exporter goes down working";
|
||||||
|
};
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -82,35 +89,34 @@ in
|
||||||
PROMETHEUS_CLIENT_CERT = client_cert;
|
PROMETHEUS_CLIENT_CERT = client_cert;
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
environment.etc =
|
environment.etc = {
|
||||||
{
|
"alloy/prometheus.alloy" = {
|
||||||
"alloy/prometheus.alloy" = {
|
source = ./prometheus.alloy;
|
||||||
source = ./prometheus.alloy;
|
};
|
||||||
};
|
}
|
||||||
}
|
// lib.listToAttrs (
|
||||||
// lib.listToAttrs (
|
lib.lists.map (
|
||||||
lib.lists.map (
|
name:
|
||||||
name:
|
let
|
||||||
let
|
value = config.services.prometheus.exporters.${name};
|
||||||
value = config.services.prometheus.exporters.${name};
|
in
|
||||||
in
|
{
|
||||||
{
|
name = "alloy/prometheus_${name}.alloy";
|
||||||
name = "alloy/prometheus_${name}.alloy";
|
value = {
|
||||||
value = {
|
text = ''
|
||||||
text = ''
|
prometheus.scrape "exporter_${name}" {
|
||||||
prometheus.scrape "exporter_${name}" {
|
targets = [
|
||||||
targets = [
|
{"__address__" = "127.0.0.1:${toString value.port}", "instance" = constants.hostname, "job" = "${name}"},
|
||||||
{"__address__" = "127.0.0.1:${toString value.port}", "instance" = constants.hostname, "job" = "${name}"},
|
]
|
||||||
]
|
|
||||||
|
|
||||||
scrape_interval = "1m"
|
scrape_interval = "1m"
|
||||||
|
|
||||||
forward_to = [otelcol.receiver.prometheus.default.receiver]
|
forward_to = [otelcol.receiver.prometheus.default.receiver]
|
||||||
}
|
}
|
||||||
'';
|
'';
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
) cfg.exporters.enabled
|
) cfg.exporters.enabled
|
||||||
);
|
);
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
|
@ -181,6 +181,7 @@ in
|
||||||
};
|
};
|
||||||
users.groups.git = { };
|
users.groups.git = { };
|
||||||
|
|
||||||
|
khscodes.infrastructure.vault-prometheus-sender.exporters.external = [ "forgejo" ];
|
||||||
environment.etc."alloy/forgejo_prometheus.alloy" = {
|
environment.etc."alloy/forgejo_prometheus.alloy" = {
|
||||||
text = ''
|
text = ''
|
||||||
prometheus.scrape "forgejo_exporter" {
|
prometheus.scrape "forgejo_exporter" {
|
||||||
|
|
|
@ -0,0 +1,37 @@
|
||||||
|
# Builds the "Scraping" alert rule group as a YAML fragment (a string).
#
# Arguments:
#   inputs - flake inputs; only `inputs.self.nixosConfigurations` is read.
#   lib    - the nixpkgs library.
#
# For every NixOS configuration that has
# `khscodes.infrastructure.vault-prometheus-sender.enable` set, one
# `absent_over_time(up{instance=..., job=...}[2m])` selector is generated per
# job listed in `exporters.enabled ++ exporters.external`. All selectors are
# joined with " or " into the expression of a single `JobDown` alert.
#
# Result: the YAML rule group, or "" when there is no job to watch at all
# (a rule with an empty `expr` would not be a valid rule).
#
# NOTE(review): absent_over_time only matches when the `up` series has no
# samples in the window. A target that is still scraped but failing would
# normally report `up == 0`, which this expression does not cover - confirm
# that this is intended for the Alloy -> OTel pipeline in use.
{ inputs, lib }:
let
  # Selectors for a single host; an empty list for hosts that do not send metrics.
  absentExprsFor =
    nixos:
    let
      sender = nixos.config.khscodes.infrastructure.vault-prometheus-sender;
      instance = nixos.config.khscodes.networking.fqdn;
      instanceJobs = sender.exporters.enabled ++ sender.exporters.external;
    in
    # `optionals` keeps `instance` unevaluated for hosts where the sender is disabled.
    lib.lists.optionals sender.enable (
      lib.lists.map (
        job: ''absent_over_time(up{instance="${instance}", job="${job}"}[2m])''
      ) instanceJobs
    );

  # Flat list of selectors across all hosts. Flattening (instead of joining per
  # host first) means a host with zero jobs contributes nothing rather than an
  # empty operand, which would produce an invalid "... or  or ..." expression.
  jobs = lib.lists.concatMap absentExprsFor (
    lib.attrsets.attrValues inputs.self.nixosConfigurations
  );

  exprs = lib.strings.concatStringsSep " or " jobs;
in
if jobs == [ ] then
  ""
else
  ''
    - name: Scraping
      rules:
        - alert: JobDown
          expr: >
            ${exprs}
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: "Scrape job {{ $labels.job }} on {{ $labels.instance }} is down"
  ''
|
|
@ -2,6 +2,7 @@
|
||||||
inputs,
|
inputs,
|
||||||
config,
|
config,
|
||||||
pkgs,
|
pkgs,
|
||||||
|
lib,
|
||||||
...
|
...
|
||||||
}:
|
}:
|
||||||
let
|
let
|
||||||
|
@ -126,6 +127,7 @@ in
|
||||||
${builtins.readFile ./alerts/postfix.yaml}
|
${builtins.readFile ./alerts/postfix.yaml}
|
||||||
${builtins.readFile ./alerts/postgres.yaml}
|
${builtins.readFile ./alerts/postgres.yaml}
|
||||||
${builtins.readFile ./alerts/systemd.yaml}
|
${builtins.readFile ./alerts/systemd.yaml}
|
||||||
|
${import ./alerts/job_up.nix { inherit inputs lib; }}
|
||||||
''
|
''
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue