Begin adding some alerts and add postgres exporter
Some checks failed
/ dev-shell (push) Successful in 2m31s
/ rust-packages (push) Successful in 6m57s
/ check (push) Failing after 11m24s
/ terraform-providers (push) Successful in 14m45s
/ systems (push) Successful in 34m47s

Kaare Hoff Skovgaard 2025-07-22 00:01:13 +02:00
parent 0233431feb
commit e28f501316
Signed by: khs
GPG key ID: C7D890804F01E9F0
7 changed files with 301 additions and 62 deletions

View file

@@ -11,6 +11,7 @@ let
   client_cert = "/var/lib/alloy/prometheus.cert";
 in
 {
+  imports = [ ./postgres.nix ];
   options.khscodes.infrastructure.vault-prometheus-sender = {
     enable = lib.mkEnableOption "Configures the server approle to allow sending data to prometheus";
     terranixBackendName = lib.mkOption {
@@ -18,6 +19,11 @@ in
       description = "This should only be configured for the server hosting vault, to allow setting up dependencies in terraform";
       default = "prometheus-mtls";
     };
+    exporters.enabled = lib.mkOption {
+      type = lib.types.listOf lib.types.str;
+      default = [ ];
+      description = "List of config.services.prometheus.exporters.<name> that are enabled. This is not done automatically as I don't know how to do that without triggering removed options warnings.";
+    };
   };
   config = lib.mkIf cfg.enable {
@@ -76,8 +82,35 @@ in
         PROMETHEUS_CLIENT_CERT = client_cert;
       };
     };
-    environment.etc."alloy/prometheus.alloy" = {
-      source = ./prometheus.alloy;
-    };
+    environment.etc =
+      {
+        "alloy/prometheus.alloy" = {
+          source = ./prometheus.alloy;
+        };
+      }
+      // lib.listToAttrs (
+        lib.lists.map (
+          name:
+          let
+            value = config.services.prometheus.exporters.${name};
+          in
+          {
+            name = "alloy/prometheus_${name}.alloy";
+            value = {
+              text = ''
+                prometheus.scrape "exporter_${name}" {
+                  targets = [
+                    {"__address__" = "127.0.0.1:${toString value.port}", "instance" = constants.hostname, "job" = "${name}"},
+                  ]
+                  scrape_interval = "1m"
+                  forward_to = [otelcol.receiver.prometheus.default.receiver]
+                }
+              '';
+            };
+          }
+        ) cfg.exporters.enabled
+      );
   };
 }

View file

@@ -0,0 +1,86 @@
{
  lib,
  config,
  pkgs,
  ...
}:
let
  sql = pkgs.writeText "create-postgres-exporter.sql" ''
    -- To use IF statements, hence to be able to check if the user exists before
    -- attempting creation, we need to switch to procedural SQL (PL/pgSQL)
    -- instead of standard SQL.
    -- More: https://www.postgresql.org/docs/9.3/plpgsql-overview.html
    -- To preserve compatibility with <9.0, DO blocks are not used; instead,
    -- a function is created and dropped.
    CREATE OR REPLACE FUNCTION __tmp_create_user() returns void as $$
    BEGIN
      IF NOT EXISTS (
        SELECT -- SELECT list can stay empty for this
        FROM pg_catalog.pg_user
        WHERE usename = 'postgres_exporter') THEN
        CREATE USER postgres_exporter;
      END IF;
    END;
    $$ language plpgsql;
    SELECT __tmp_create_user();
    DROP FUNCTION __tmp_create_user();
    ALTER USER postgres_exporter SET SEARCH_PATH TO postgres_exporter,pg_catalog;
    -- If deploying as non-superuser (for example in AWS RDS), uncomment the GRANT
    -- line below and replace <MASTER_USER> with your root user.
    -- GRANT postgres_exporter TO <MASTER_USER>;
    GRANT CONNECT ON DATABASE postgres TO postgres_exporter;
    GRANT pg_monitor to postgres_exporter;
  '';
  script = pkgs.writeShellApplication {
    name = "postgresql-create-postgres-exporter";
    runtimeInputs = [ config.services.postgresql.package ];
    text = ''
      PSQL="psql --port=${builtins.toString config.services.postgresql.settings.port}"
      while ! $PSQL -d postgres -c "" 2> /dev/null; do
        sleep 0.1
      done
      $PSQL -tAf ${sql}
    '';
  };
in
{
  config =
    lib.mkIf
      (config.khscodes.infrastructure.vault-prometheus-sender.enable && config.services.postgresql.enable)
      {
        khscodes.infrastructure.vault-prometheus-sender.exporters.enabled = [ "postgres" ];
        services.prometheus.exporters.postgres = {
          enable = true;
          dataSourceName = "user=postgres_exporter host=/run/postgresql database=postgres sslmode=disable";
          user = "postgres_exporter";
          group = "postgres_exporter";
        };
        users.users.postgres_exporter = {
          enable = true;
          group = "postgres_exporter";
          isSystemUser = true;
        };
        users.groups.postgres_exporter = { };
        systemd.services.postgresql-create-postgres-exporter = {
          enable = true;
          after = [ "postgresql.service" ];
          requires = [ "postgresql.service" ];
          wantedBy = [ "multi-user.target" ];
          before = [ "prometheus-postgres-exporter.service" ];
          serviceConfig = {
            Type = "oneshot";
            ExecStart = [
              (lib.getExe script)
            ];
            User = "postgres";
            Group = "postgres";
          };
        };
      };
}

View file

@@ -5,13 +5,44 @@
   modulesPath,
   ...
 }:
-# TODO: Enable and configure prometheus-nginx-exporter and prometheus-nginxlog-exporter
-# to get some metrics into prometheus.
 let
   cfg = config.khscodes.services.nginx;
   locationOptions = import "${modulesPath}/services/web-servers/nginx/location-options.nix" {
     inherit lib config;
   };
+  logfmt = ''
+    log_format logfmt escape=default
+      'ts="$time_iso8601" http_host="$http_host" request_method="$request_method" '
+      'uri="$request_uri" ' # full path
+      'status=$status ' # response status
+      'remote_addr="$remote_addr" remote_port="$remote_port" ' # remote ip and port (port should be dynamic)
+      'http_user_agent="$http_user_agent" '
+      'http_referer="$http_referer" '
+      'http_x_forwarded_for="$http_x_forwarded_for" '
+      'server_name="$server_name" ' # the name of the vhost serving the request, may be different from http_host
+      'scheme="$scheme" ' # http/https
+      'body_bytes_sent=$body_bytes_sent ' # the number of body bytes excluding headers sent to a client
+      'bytes_sent=$bytes_sent ' # the number of bytes sent to a client
+      'ssl_protocol="$ssl_protocol" ' # TLS protocol
+      'ssl_cipher="$ssl_cipher" ' # TLS cipher
+      'server_protocol="$server_protocol" ' # request protocol, like HTTP/1.1 or HTTP/2.0
+      'remote_user="$remote_user" ' # http user name (should be unused for this setup)
+      'pid="$pid" ' # process pid
+      'connection="$connection" ' # connection serial number
+      'connection_requests="$connection_requests" ' # number of requests made in connection
+      'request_id="$request_id" ' # unique id for the request
+      'request_length=$request_length ' # request length (including headers and body)
+      'gzip_ratio="$gzip_ratio" '
+      'request_time=$request_time ' # request processing time in seconds with msec resolution
+      'upstream="$upstream_addr" ' # upstream backend server for proxied requests
+      'upstream_connect_time=$upstream_connect_time ' # upstream handshake time incl. TLS
+      'upstream_header_time=$upstream_header_time ' # time spent receiving upstream headers
+      'upstream_response_time=$upstream_response_time ' # time spent receiving upstream body
+      'upstream_response_length=$upstream_response_length ' # upstream response length
+      'upstream_cache_status="$upstream_cache_status" ' # cache HIT/MISS where applicable
+      'msec=$msec' # unix timestamp with millis in fraction
+      ;
+  '';
   vhostOption = lib.khscodes.mkSubmodule {
     description = "nginx vhost";
     options = {
@@ -133,12 +164,59 @@ in
         message = "Cannot use `config.khscodes.services.nginx.virtualHosts.<name>.acme = {}` without setting config.khscodes.security.acme.dns01Enabled";
       }
     ];
+    services.prometheus.exporters = {
+      nginx = {
+        enable = config.khscodes.infrastructure.vault-prometheus-sender.enable;
+      };
+      nginxlog = {
+        enable = config.khscodes.infrastructure.vault-prometheus-sender.enable;
+        settings = {
+          namespaces = lib.lists.map (vhost: {
+            name = vhost;
+            format = "$remote_addr - $remote_user [$time_local] \"$request\" $status $body_bytes_sent \"$http_referer\" \"$http_user_agent\" \"$http_x_forwarded_for\" $upstream_response_time";
+            metrics_override = {
+              prefix = "";
+            };
+            namespace_label = "vhost";
+            source = {
+              files = [ "/var/log/nginx/access.${vhost}.log" ];
+            };
+            histogram_buckets = [
+              0.005
+              0.01
+              0.025
+              0.05
+              0.1
+              0.25
+              0.5
+              1
+              2.5
+              5
+              10
+            ];
+          }) (lib.attrsets.attrNames cfg.virtualHosts);
+        };
+      };
+    };
+    systemd.services.prometheus-nginxlog-exporter.serviceConfig.DynamicUser = false;
+    khscodes.infrastructure.vault-prometheus-sender.exporters.enabled = [
+      "nginx"
+      "nginxlog"
+    ];
+    users = {
+      users.${config.services.prometheus.exporters.nginxlog.user} = {
+        group = config.services.prometheus.exporters.nginxlog.group;
+        extraGroups = [ config.services.nginx.group ];
+        isSystemUser = true;
+      };
+      groups.${config.services.prometheus.exporters.nginxlog.user} = { };
+    };
     services.fail2ban.jails = {
       nginx-botsearch = {
         settings = {
           filter = "nginx-botsearch";
           port = "http,https";
-          logpath = "/var/log/nginx/access.log";
+          logpath = "/var/log/nginx/access.fail2ban.log";
           backend = "auto";
           findtime = 600;
           maxretry = 5;
@@ -148,10 +226,10 @@ in
         settings = {
           filter = "nginx-bad-request";
           port = "http,https";
-          logpath = "/var/log/nginx/access.log";
+          logpath = "/var/log/nginx/access.fail2ban.log";
           backend = "auto";
           findtime = 600;
-          maxretry = 2;
+          maxretry = 30;
         };
       };
       nginx-req-limit = {
@@ -160,7 +238,7 @@ in
           port = "http,https";
           backend = "systemd";
           findtime = 600;
-          maxretry = 3;
+          maxretry = 5;
         };
       };
     };
@@ -190,6 +268,18 @@ in
       recommendedOptimisation = lib.mkDefault true;
       recommendedZstdSettings = lib.mkDefault true;
       recommendedProxySettings = lib.mkDefault true;
+      commonHttpConfig = ''
+        ${logfmt}
+        access_log /var/log/nginx/access.logfmt.log logfmt;
+        log_format fail2ban '$remote_addr - $remote_user [$time_local] "$request" '
+          '$status $body_bytes_sent "$http_referer" '
+          '"$http_user_agent" "$http_x_forwarded_for"';
+        log_format nginx_exporter '$remote_addr - $remote_user [$time_local] "$request" '
+          '$status $body_bytes_sent "$http_referer" '
+          '"$http_user_agent" "$http_x_forwarded_for" $upstream_response_time';
+        access_log /var/log/nginx/access.fail2ban.log fail2ban;
+      '';
       appendHttpConfig = ''
         limit_req_zone $binary_remote_addr zone=nobots:10m rate=20r/s;
         map $scheme $hsts_header {
@@ -210,52 +300,6 @@ in
           "" "Unknown";
         }
-        log_format json_analytics escape=json '{'
-          '"msec": "$msec", ' # request unixtime in seconds with a milliseconds resolution
-          '"connection": "$connection", ' # connection serial number
-          '"connection_requests": "$connection_requests", ' # number of requests made in connection
-          '"pid": "$pid", ' # process pid
-          '"request_id": "$request_id", ' # the unique request id
-          '"request_length": "$request_length", ' # request length (including headers and body)
-          '"remote_addr": "$remote_addr", ' # client IP
-          '"remote_user": "$remote_user", ' # client HTTP username
-          '"remote_port": "$remote_port", ' # client port
-          '"time_local": "$time_local", '
-          '"time_iso8601": "$time_iso8601", ' # local time in the ISO 8601 standard format
-          '"request": "$request", ' # full path no arguments if the request
-          '"request_uri": "$request_uri", ' # full path and arguments if the request
-          '"args": "$args", ' # args
-          '"status": "$status", ' # response status code
-          '"body_bytes_sent": "$body_bytes_sent", ' # the number of body bytes exclude headers sent to a client
-          '"bytes_sent": "$bytes_sent", ' # the number of bytes sent to a client
-          '"http_referer": "$http_referer", ' # HTTP referer
-          '"http_user_agent": "$http_user_agent", ' # user agent
-          '"http_x_forwarded_for": "$http_x_forwarded_for", ' # http_x_forwarded_for
-          '"http_host": "$http_host", ' # the request Host: header
-          '"server_name": "$server_name", ' # the name of the vhost serving the request
-          '"request_time": "$request_time", ' # request processing time in seconds with msec resolution
-          '"upstream": "$upstream_addr", ' # upstream backend server for proxied requests
-          '"upstream_connect_time": "$upstream_connect_time", ' # upstream handshake time incl. TLS
-          '"upstream_header_time": "$upstream_header_time", ' # time spent receiving upstream headers
-          '"upstream_response_time": "$upstream_response_time", ' # time spent receiving upstream body
-          '"upstream_response_length": "$upstream_response_length", ' # upstream response length
-          '"upstream_cache_status": "$upstream_cache_status", ' # cache HIT/MISS where applicable
-          '"ssl_protocol": "$ssl_protocol", ' # TLS protocol
-          '"ssl_cipher": "$ssl_cipher", ' # TLS cipher
-          '"scheme": "$scheme", ' # http or https
-          '"request_method": "$request_method", ' # request method
-          '"server_protocol": "$server_protocol", ' # request protocol, like HTTP/1.1 or HTTP/2.0
-          '"pipe": "$pipe", ' # "p" if request was pipelined, "." otherwise
-          '"gzip_ratio": "$gzip_ratio"'
-          '}';
-        access_log /var/log/nginx/access.json.log json_analytics;
-        log_format main '$remote_addr - $remote_user [$time_local] "$request" '
-          '$status $body_bytes_sent "$http_referer" '
-          '"$http_user_agent" "$http_x_forwarded_for"';
-        access_log /var/log/nginx/access.log main;
         ${modernSslAppendedHttpConfig}
       '';
       virtualHosts = lib.attrsets.mapAttrs (
@@ -275,6 +319,9 @@ in
           extraConfig = ''
             ${mtls}
             ${reqLimit}
+            access_log /var/log/nginx/access.fail2ban.log fail2ban;
+            access_log /var/log/nginx/access.logfmt.log logfmt;
+            access_log /var/log/nginx/access.${name}.log nginx_exporter;
             ${value.extraConfig}
           '';
         in

View file

@@ -12,21 +12,33 @@ loki_send "nginx_stream_error" {
 }
 loki.source.file "nginx_access_logs" {
   targets = [{
-    __path__ = "/var/log/nginx/access.json.log",
+    __path__ = "/var/log/nginx/access.logfmt.log",
   }]
   forward_to = [loki.process.nginx_access_logs.receiver]
 }
 loki.process "nginx_access_logs" {
   forward_to = [loki_send.nginx_access.receiver]
-  stage.json {
-    expressions = {
-      timestamp = "time_iso8601",
+  stage.logfmt {
+    mapping = {
+      timestamp = "msec",
+      http_host = "http_host",
+      level = "status",
     }
   }
   stage.timestamp {
     source = "timestamp"
-    format = "RFC3339"
+    format = "Unix"
   }
+  stage.template {
+    source = "level"
+    template = "{{$v := atoi .Value}}{{if gt $v 499}}error{{else if gt $v 399}}warn{{ else if lt $v 200}}debug{{else}}info{{end}}"
+  }
+  stage.structured_metadata {
+    values = {
+      level = "",
+      http_host = "",
+    }
+  }
 }

View file

@@ -0,0 +1,21 @@
groups:
  - name: Http
    rules:
      - alert: NginxDown
        expr: >
          nginx_up{job="nginx"} == 0
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "Nginx on {{ $labels.instance }} is down"
      - alert: HighHttpErrorRate
        expr: >
          sum by(vhost, instance) (rate(http_response_count_total{status=~"5..",job="nginxlog"}[1m])) /
          sum by(vhost, instance) (rate(http_response_count_total{job="nginxlog"}[1m]))
          > 0
        for: 30m
        labels:
          severity: critical
        annotations:
          summary: "Nginx VHost {{ $labels.vhost }} on {{ $labels.instance }} is generating many internal server errors over 1 hour"

View file

@@ -1,12 +1,18 @@
 {
   inputs,
   config,
+  pkgs,
   ...
 }:
 let
   grafana = config.services.grafana;
   loki = config.services.loki;
   prometheus = config.services.prometheus;
+  nginxExporterSrc = "${pkgs.prometheus-nginx-exporter.src}/grafana/dashboard.json";
+  postgresqlDashboard = pkgs.fetchurl {
+    url = "https://grafana.com/api/dashboards/9628/revisions/8/download";
+    hash = "sha256-UhusNAZbyt7fJV/DhFUK4FKOmnTpG0R15YO2r+nDnMc=";
+  };
 in
 {
   imports = [
@@ -54,16 +60,33 @@ in
       };
       provision = {
         enable = true;
+        alerting = {
+          rules = {
+            settings = {
+              deleteRules = [
+                {
+                  uid = "desmw56u3jfgga";
+                  orgId = 1;
+                }
+              ];
+            };
+          };
+        };
         datasources.settings.datasources = [
           {
             url = "http://${loki.configuration.server.http_listen_address}:${toString loki.configuration.server.http_listen_port}";
             type = "loki";
             name = "Logs";
+            uid = "loki";
           }
           {
             url = "http://${prometheus.listenAddress}:${toString prometheus.port}";
             type = "prometheus";
             name = "Metrics";
+            uid = "prometheus";
+            jsonData = {
+              manageAlerts = true;
+            };
           }
         ];
         dashboards.settings.providers = [
@@ -71,6 +94,14 @@ in
             name = "Node Exporter";
             options.path = ./grafana/dashboards/node_exporter;
           }
+          {
+            name = "Nginx";
+            options.path = nginxExporterSrc;
+          }
+          {
+            name = "Postgresql";
+            options.path = postgresqlDashboard;
+          }
         ];
       };
     };
@@ -78,6 +109,10 @@ in
     enable = true;
     listenAddress = "127.0.0.1";
     extraFlags = [ "--web.enable-otlp-receiver" ];
+    # alertmanager.enable = true;
+    rules = [
+      (builtins.readFile ./alerts/http.yaml)
+    ];
   };
   services.loki = {
     enable = true;
@@ -130,16 +165,21 @@ in
         delete_request_store = "filesystem";
         working_directory = "${config.services.loki.dataDir}/retention";
       };
+      limits_config = {
+        allow_structured_metadata = true;
+        discover_log_levels = true;
+      };
     };
   };
   khscodes = {
     infrastructure.khs-openstack-instance = {
       enable = true;
-      flavor = "m.medium";
+      flavor = "m.large";
     };
     services.nginx = {
       enable = true;
       virtualHosts."monitoring.kaareskovgaard.net" = {
+        rateLimit.enable = false;
         locations."/" = {
           proxyPass = "http://${grafana.settings.server.http_addr}:${toString grafana.settings.server.http_port}";
           proxyWebsockets = true;