Monitoring: Add scrape down alerts
This commit is contained in:
parent
af583fcbd4
commit
28f4b34cd1
5 changed files with 78 additions and 32 deletions
|
@ -19,11 +19,18 @@ in
|
|||
description = "This should only be configured for the server hosting vault, to allow setting up dependencies in terraform";
|
||||
default = "prometheus-mtls";
|
||||
};
|
||||
exporters.enabled = lib.mkOption {
|
||||
exporters = {
|
||||
enabled = lib.mkOption {
|
||||
type = lib.types.listOf lib.types.str;
|
||||
default = [ ];
|
||||
description = "List of config.services.prometheus.exporters.<name> that are enabled. This is not done automatically as I don't know how to do that without triggering removed options warnings.";
|
||||
};
|
||||
external = lib.mkOption {
|
||||
type = lib.types.listOf lib.types.str;
|
||||
default = [ "node_exporter" ];
|
||||
description = "Externally managed exporters. Each entry should be the job name of that exporter. Adding a job to this list enables alerting when that exporter goes down.";
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
config = lib.mkIf cfg.enable {
|
||||
|
@ -82,8 +89,7 @@ in
|
|||
PROMETHEUS_CLIENT_CERT = client_cert;
|
||||
};
|
||||
};
|
||||
environment.etc =
|
||||
{
|
||||
environment.etc = {
|
||||
"alloy/prometheus.alloy" = {
|
||||
source = ./prometheus.alloy;
|
||||
};
|
||||
|
|
|
@ -181,6 +181,7 @@ in
|
|||
};
|
||||
users.groups.git = { };
|
||||
|
||||
khscodes.infrastructure.vault-prometheus-sender.exporters.external = [ "forgejo" ];
|
||||
environment.etc."alloy/forgejo_prometheus.alloy" = {
|
||||
text = ''
|
||||
prometheus.scrape "forgejo_exporter" {
|
||||
|
|
|
@ -0,0 +1,37 @@
|
|||
# Renders the "Scraping" Prometheus alert rule group as a YAML fragment.
#
# Arguments:
#   inputs - flake inputs; every entry of `inputs.self.nixosConfigurations` is inspected.
#   lib    - nixpkgs lib.
#
# Result: a string containing one rule group with a single `JobDown` alert whose
# expression is the `or` of one `absent_over_time(up{...}[2m])` selector per
# (host fqdn, job name) pair, where the job names are the host's
# `vault-prometheus-sender.exporters.enabled ++ exporters.external`.
# Hosts with `vault-prometheus-sender.enable = false` contribute nothing.
# If no selector exists at all, the result is the empty string, so no rule
# with an empty `expr` is ever emitted.
{ inputs, lib }:
let
  # Selector that yields a sample when `up` has been absent for the given
  # instance/job pair during the last two minutes.
  absentSelector =
    instance: job: ''absent_over_time(up{instance="${instance}", job="${job}"}[2m])'';

  # All selectors for a single NixOS configuration. A host with the sender
  # enabled but no jobs yields an empty list (not an empty string), so it
  # cannot introduce a dangling " or " into the joined expression.
  hostSelectors =
    nixos:
    let
      sender = nixos.config.khscodes.infrastructure.vault-prometheus-sender;
      instance = nixos.config.khscodes.networking.fqdn;
      instanceJobs = sender.exporters.enabled ++ sender.exporters.external;
    in
    if sender.enable then map (absentSelector instance) instanceJobs else [ ];

  # attrValues iterates in attribute-name order, the same order foldlAttrs uses.
  selectors = lib.lists.concatMap hostSelectors (
    builtins.attrValues inputs.self.nixosConfigurations
  );

  exprs = lib.strings.concatStringsSep " or " selectors;
in
lib.strings.optionalString (selectors != [ ]) ''
  - name: Scraping
    rules:
      - alert: JobDown
        expr: >
          ${exprs}
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "Scrape job {{ $labels.job }} on {{ $labels.instance }} is down"
''
|
|
@ -2,6 +2,7 @@
|
|||
inputs,
|
||||
config,
|
||||
pkgs,
|
||||
lib,
|
||||
...
|
||||
}:
|
||||
let
|
||||
|
@ -126,6 +127,7 @@ in
|
|||
${builtins.readFile ./alerts/postfix.yaml}
|
||||
${builtins.readFile ./alerts/postgres.yaml}
|
||||
${builtins.readFile ./alerts/systemd.yaml}
|
||||
${import ./alerts/job_up.nix { inherit inputs lib; }}
|
||||
''
|
||||
];
|
||||
};
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue