Add some more alerting and fail2ban rules
All checks were successful
/ dev-shell (push) Successful in 33s
/ rust-packages (push) Successful in 38s
/ terraform-providers (push) Successful in 1m1s
/ check (push) Successful in 2m7s
/ systems (push) Successful in 3m52s

This commit is contained in:
Kaare Hoff Skovgaard 2025-07-22 15:17:17 +02:00
parent e28f501316
commit 8e21df1764
Signed by: khs
GPG key ID: C7D890804F01E9F0
8 changed files with 214 additions and 25 deletions

View file

@ -70,6 +70,9 @@ in
DEFAULT = {
APP_NAME = "KAS: Codes";
};
metrics = {
ENABLED = true;
};
server = rec {
DOMAIN = "kas.codes";
ROOT_URL = "https://${DOMAIN}";
@ -189,6 +192,9 @@ in
locations."/" = {
proxyPass = "http://localhost:3000";
};
locations."/metrics" = {
return = "404";
};
};
};
};
@ -199,4 +205,19 @@ in
useDefaultShell = true;
};
users.groups.git = { };
# Alloy configuration fragment installed as /etc/alloy/forgejo_prometheus.alloy.
# It scrapes Forgejo's /metrics endpoint on 127.0.0.1 (at the configured
# server.HTTP_PORT) once a minute and forwards the samples to
# otelcol.receiver.prometheus.default.
environment.etc."alloy/forgejo_prometheus.alloy" = {
text = ''
prometheus.scrape "forgejo_exporter" {
scrape_interval = "1m"
targets = [
{
"__address__" = "127.0.0.1:${toString config.services.forgejo.settings.server.HTTP_PORT}",
},
]
metrics_path = "/metrics"
forward_to = [otelcol.receiver.prometheus.default.receiver]
}
'';
};
}

View file

@ -10,6 +10,68 @@
capabilities = [ "read" ];
};
};
# Outbound firewall rules for the Hetzner instance. Every rule permits traffic
# to any IPv4 and IPv6 destination on a single protocol/port pair.
khscodes.infrastructure.hetzner-instance.extraFirewallRules =
  let
    # Build one outbound rule that is open to all IPv4 and IPv6 destinations.
    allowOut = protocol: port: description: {
      direction = "out";
      inherit protocol port description;
      destination_ips = [
        "0.0.0.0/0"
        "::/0"
      ];
    };
  in
  [
    (allowOut "tcp" 25 "smtp")
    (allowOut "tcp" 80 "http")
    # TCP 443 is HTTPS; it gets its own description instead of sharing "http"
    # with the port 80 rule.
    (allowOut "tcp" 443 "https")
    (allowOut "udp" 443 "quic")
    # DNS is allowed over both UDP and TCP.
    (allowOut "udp" 53 "dns")
    (allowOut "tcp" 53 "dns")
  ];
khscodes.infrastructure.provisioning.pre.modules = [
{
khscodes.vault = {
@ -25,6 +87,34 @@
};
}
];
services.prometheus.exporters.postfix = {
enable = true;
};
khscodes.infrastructure.vault-prometheus-sender.exporters.enabled = [ "postfix" ];
# fail2ban jails covering the mail stack: SASL authentication failures plus the
# postfix and dovecot jails.
services.fail2ban.jails =
  let
    # Thresholds shared by the postfix and dovecot jails.
    sharedJailSettings = {
      enabled = true;
      findtime = 600;
      maxretry = 3;
    };
  in
  {
    # Uses the postfix filter in its "auth" mode and bans on all mail ports.
    postfix-sasl.settings = {
      filter = "postfix[mode=auth]";
      port = "smtp,submission,imap,imaps,pop3,pop3s";
      findtime = 600;
      maxretry = 5;
    };
    postfix.settings = sharedJailSettings;
    dovecot.settings = sharedJailSettings;
  };
mailserver = {
enable = true;
fqdn = "kas.codes";
@ -37,7 +127,7 @@
};
certificateScheme = "acme";
dkimKeyDirectory = "/var/lib/vault-agent/mailserver/dkim/";
dkimSelector = "dkim_rsa";
dkimSelector = "snm_rsa";
# Not sure we need to set this at all.
dkimKeyBits = 2048;
};

View file

@ -1,3 +1,4 @@
{ pkgs, lib, ... }:
let
publicKeyBegin = ''"-----BEGIN PUBLIC KEY-----\n"'';
publicKeyEnd = ''"-----END PUBLIC KEY-----\n"'';
@ -13,10 +14,19 @@ in
{{ .Data.data.dkim_private_key }}
{{- end -}}
'';
destination = "/var/lib/vault-agent/mailserver/dkim/rsa_private.key";
destination = "/var/lib/vault-agent/mailserver/dkim/kas.codes.snm_rsa.key";
perms = "0600";
owner = "rspamd";
group = "rspamd";
# Executable that deletes the kas.codes.snm_rsa.txt file in the DKIM key directory.
# NOTE(review): presumably run whenever this template is (re)rendered - confirm
# against the vault-agent template module.
exec = lib.getExe (
pkgs.writeShellApplication {
name = "kas.codes.snm_rsa-remove-txt";
runtimeInputs = [ pkgs.uutils-coreutils-noprefix ];
text = ''
rm -f /var/lib/vault-agent/mailserver/dkim/kas.codes.snm_rsa.txt
'';
}
);
restartUnits = [
"rspamd.service"
"postfix.service"
@ -28,10 +38,19 @@ in
{{ .Data.data.dkim_private_key }}
{{- end -}}
'';
destination = "/var/lib/vault-agent/mailserver/dkim/ed25519_private.key";
destination = "/var/lib/vault-agent/mailserver/dkim/kas.codes.snm_ed25519.key";
perms = "0600";
owner = "rspamd";
group = "rspamd";
# Executable that deletes the kas.codes.snm_ed25519.txt file in the DKIM key directory.
# NOTE(review): presumably run whenever this template is (re)rendered - confirm
# against the vault-agent template module.
exec = lib.getExe (
  pkgs.writeShellApplication {
    # Named after the ed25519 selector so the script is distinguishable from
    # the RSA key's hook.
    name = "kas.codes.snm_ed25519-remove-txt";
    runtimeInputs = [ pkgs.uutils-coreutils-noprefix ];
    text = ''
      rm -f /var/lib/vault-agent/mailserver/dkim/kas.codes.snm_ed25519.txt
    '';
  }
);
restartUnits = [
"rspamd.service"
"postfix.service"

View file

@ -1,21 +1,20 @@
groups:
- name: Http
rules:
- alert: NginxDown
expr: >
nginx_up{job="nginx"} == 0
for: 10m
labels:
severity: critical
annotations:
summary: "Nginx on {{ $labels.instance }} is down"
- alert: HighHttpErrorRate
expr: >
sum by(vhost, instance) (rate(http_response_count_total{status=~"5..",job="nginxlog"}[1m])) /
sum by(vhost, instance) (rate(http_response_count_total{job="nginxlog"}[1m]))
> 0
for: 30m
labels:
severity: critical
annotations:
summary: "Nginx VHost {{ $labels.vhost }} on {{ $labels.instance }} is generating many internal server errors over 1 hour"
# HTTP alerts based on the nginx exporter and the nginx access-log exporter.
- name: Http
  rules:
    # Fires when the nginx exporter has reported nginx_up == 0 for 10 minutes.
    - alert: NginxDown
      expr: >
        nginx_up{job="nginx"} == 0
      for: 10m
      labels:
        severity: critical
      annotations:
        summary: "Nginx on {{ $labels.instance }} is down"
    # Fires when a vhost's share of 5xx responses (1m rate) has been above zero
    # for 30 minutes.
    - alert: HighHttpErrorRate
      expr: >
        sum by(vhost, instance) (rate(http_response_count_total{status=~"5..",job="nginxlog"}[1m])) /
        sum by(vhost, instance) (rate(http_response_count_total{job="nginxlog"}[1m]))
        > 0
      for: 30m
      labels:
        severity: critical
      annotations:
        # The duration in the message matches the `for: 30m` window above.
        summary: "Nginx VHost {{ $labels.vhost }} on {{ $labels.instance }} has been generating internal server errors for over 30 minutes"

View file

@ -0,0 +1,18 @@
# Postfix alerts, evaluated against metrics from the job labelled "postfix".
- name: Postfix
rules:
# Fires when postfix_up has been 0 for 10 minutes.
- alert: PostfixDown
expr: >
postfix_up{job="postfix"} == 0
for: 10m
labels:
severity: critical
annotations:
summary: "Postfix on {{ $labels.instance }} is down"
# Fires when the summed size of messages in a queue has stayed above zero for
# 10 minutes, i.e. the queue has not drained in that time.
- alert: PostfixQueueNotEmptying
expr: >
postfix_showq_message_size_bytes_sum{job="postfix"} > 0
for: 10m
labels:
severity: critical
annotations:
summary: "Postfix queue {{ $labels.queue }} on {{ $labels.instance }} has been non-empty over 10m"

View file

@ -0,0 +1,18 @@
# Postgres alerts, evaluated against metrics from the job labelled "postgres".
- name: Postgres
rules:
# Fires when pg_up has been 0 for 10 minutes.
- alert: PgDown
expr: >
pg_up{job="postgres"} == 0
for: 10m
labels:
severity: critical
annotations:
summary: "Postgres on {{ $labels.instance }} is down"
# Fires when the exporter has reported a scrape error for 10 minutes.
- alert: PgScrapeError
expr: >
pg_exporter_last_scrape_error{job="postgres"} > 0
for: 10m
labels:
severity: warning
annotations:
summary: "Could not scrape postgres on {{ $labels.instance }}"

View file

@ -0,0 +1,10 @@
# Systemd alerts, evaluated against node_systemd_unit_state from the
# "integrations/node_exporter" job.
- name: Systemd
  rules:
    # Fires when a unit has been in the "failed" state for 10 minutes.
    - alert: UnitFailed
      expr: >
        node_systemd_unit_state{job="integrations/node_exporter",state="failed"} == 1
      for: 10m
      labels:
        # "warning" is the spelling used by the Postgres rules; one spelling
        # keeps severity-based matching consistent.
        severity: warning
      annotations:
        summary: "Unit {{ $labels.name }} on {{ $labels.instance }} is in failed state"

View file

@ -13,6 +13,10 @@ let
url = "https://grafana.com/api/dashboards/9628/revisions/8/download";
hash = "sha256-UhusNAZbyt7fJV/DhFUK4FKOmnTpG0R15YO2r+nDnMc=";
};
# Grafana dashboard 10013 (revision 2), fetched from grafana.com and pinned by
# hash; provisioned below as the "Postfix" dashboard.
postfixDashboard = pkgs.fetchurl {
url = "https://grafana.com/api/dashboards/10013/revisions/2/download";
hash = "sha256-SIKL1V+sJ5F7vPOwp/LuOjrGm8nCsscEX8LcLFMotfc=";
};
in
{
imports = [
@ -102,6 +106,10 @@ in
name = "Postgresql";
options.path = postgresqlDashboard;
}
{
name = "Postfix";
options.path = postfixDashboard;
}
];
};
};
@ -111,7 +119,13 @@ in
extraFlags = [ "--web.enable-otlp-receiver" ];
# alertmanager.enable = true;
rules = [
(builtins.readFile ./alerts/http.yaml)
''
groups:
${builtins.readFile ./alerts/http.yaml}
${builtins.readFile ./alerts/postfix.yaml}
${builtins.readFile ./alerts/postgres.yaml}
${builtins.readFile ./alerts/systemd.yaml}
''
];
};
services.loki = {