From e28f5013164e8928709590c75d5b1da70ac7f378 Mon Sep 17 00:00:00 2001 From: Kaare Hoff Skovgaard Date: Tue, 22 Jul 2025 00:01:13 +0200 Subject: [PATCH] Begin adding some alerts and add postgres exporter --- .../vault-prometheus-sender/default.nix | 39 ++++- .../vault-prometheus-sender/postgres.nix | 86 ++++++++++ .../vault-prometheus-sender/prometheus.alloy | 2 +- nix/modules/nixos/services/nginx/default.nix | 151 ++++++++++++------ nix/modules/nixos/services/nginx/nginx.alloy | 22 ++- .../alerts/http.yaml | 21 +++ .../monitoring.kaareskovgaard.net/default.nix | 42 ++++- 7 files changed, 301 insertions(+), 62 deletions(-) create mode 100644 nix/modules/nixos/infrastructure/vault-prometheus-sender/postgres.nix create mode 100644 nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/alerts/http.yaml diff --git a/nix/modules/nixos/infrastructure/vault-prometheus-sender/default.nix b/nix/modules/nixos/infrastructure/vault-prometheus-sender/default.nix index 1b76ee3..faf09d0 100644 --- a/nix/modules/nixos/infrastructure/vault-prometheus-sender/default.nix +++ b/nix/modules/nixos/infrastructure/vault-prometheus-sender/default.nix @@ -11,6 +11,7 @@ let client_cert = "/var/lib/alloy/prometheus.cert"; in { + imports = [ ./postgres.nix ]; options.khscodes.infrastructure.vault-prometheus-sender = { enable = lib.mkEnableOption "Configures the server approle to allow sending data to prometheus"; terranixBackendName = lib.mkOption { @@ -18,6 +19,11 @@ in description = "This should only be configured for the server hosting vault, to allow setting up dependencies in terraform"; default = "prometheus-mtls"; }; + exporters.enabled = lib.mkOption { + type = lib.types.listOf lib.types.str; + default = [ ]; + description = "List of config.services.prometheus.exporters. that are enabled. 
This is not done automatically as I don't know how to do that without triggering removed options warnings."; + }; }; config = lib.mkIf cfg.enable { @@ -76,8 +82,35 @@ in PROMETHEUS_CLIENT_CERT = client_cert; }; }; - environment.etc."alloy/prometheus.alloy" = { - source = ./prometheus.alloy; - }; + environment.etc = + { + "alloy/prometheus.alloy" = { + source = ./prometheus.alloy; + }; + } + // lib.listToAttrs ( + lib.lists.map ( + name: + let + value = config.services.prometheus.exporters.${name}; + in + { + name = "alloy/prometheus_${name}.alloy"; + value = { + text = '' + prometheus.scrape "exporter_${name}" { + targets = [ + {"__address__" = "127.0.0.1:${toString value.port}", "instance" = constants.hostname, "job" = "${name}"}, + ] + + scrape_interval = "1m" + + forward_to = [otelcol.receiver.prometheus.default.receiver] + } + ''; + }; + } + ) cfg.exporters.enabled + ); }; } diff --git a/nix/modules/nixos/infrastructure/vault-prometheus-sender/postgres.nix b/nix/modules/nixos/infrastructure/vault-prometheus-sender/postgres.nix new file mode 100644 index 0000000..a02c9cb --- /dev/null +++ b/nix/modules/nixos/infrastructure/vault-prometheus-sender/postgres.nix @@ -0,0 +1,86 @@ +{ + lib, + config, + pkgs, + ... +}: +let + sql = pkgs.writeText "create-postgres-exporter.sql" '' + -- To use IF statements, hence to be able to check if the user exists before + -- attempting creation, we need to switch to procedural SQL (PL/pgSQL) + -- instead of standard SQL. + -- More: https://www.postgresql.org/docs/9.3/plpgsql-overview.html + -- To preserve compatibility with <9.0, DO blocks are not used; instead, + -- a function is created and dropped. 
+ CREATE OR REPLACE FUNCTION __tmp_create_user() returns void as $$ + BEGIN + IF NOT EXISTS ( + SELECT -- SELECT list can stay empty for this + FROM pg_catalog.pg_user + WHERE usename = 'postgres_exporter') THEN + CREATE USER postgres_exporter; + END IF; + END; + $$ language plpgsql; + + SELECT __tmp_create_user(); + DROP FUNCTION __tmp_create_user(); + + ALTER USER postgres_exporter SET SEARCH_PATH TO postgres_exporter,pg_catalog; + + -- If deploying as non-superuser (for example in AWS RDS), uncomment the GRANT + -- line below and replace with your root user. + -- GRANT postgres_exporter TO ; + + GRANT CONNECT ON DATABASE postgres TO postgres_exporter; + GRANT pg_monitor to postgres_exporter; + ''; + script = pkgs.writeShellApplication { + name = "postgresql-create-postgres-exporter"; + runtimeInputs = [ config.services.postgresql.package ]; + text = '' + PSQL="psql --port=${builtins.toString config.services.postgresql.settings.port}" + + while ! $PSQL -d postgres -c "" 2> /dev/null; do + sleep 0.1 + done + + $PSQL -tAf ${sql} + ''; + }; +in +{ + config = + lib.mkIf + (config.khscodes.infrastructure.vault-prometheus-sender.enable && config.services.postgresql.enable) + { + khscodes.infrastructure.vault-prometheus-sender.exporters.enabled = [ "postgres" ]; + services.prometheus.exporters.postgres = { + enable = true; + dataSourceName = "user=postgres_exporter host=/run/postgresql database=postgres sslmode=disable"; + user = "postgres_exporter"; + group = "postgres_exporter"; + }; + users.users.postgres_exporter = { + enable = true; + group = "postgres_exporter"; + isSystemUser = true; + }; + users.groups.postgres_exporter = { }; + systemd.services.postgresql-create-postgres-exporter = { + enable = true; + after = [ "postgresql.service" ]; + requires = [ "postgresql.service" ]; + wantedBy = [ "multi-user.target" ]; + before = [ "prometheus-postgres-exporter.service" ]; + serviceConfig = { + Type = "oneshot"; + ExecStart = [ + (lib.getExe script) + ]; + User = 
"postgres"; + Group = "postgres"; + }; + }; + }; +} diff --git a/nix/modules/nixos/infrastructure/vault-prometheus-sender/prometheus.alloy b/nix/modules/nixos/infrastructure/vault-prometheus-sender/prometheus.alloy index 1ee6dff..d84ece5 100644 --- a/nix/modules/nixos/infrastructure/vault-prometheus-sender/prometheus.alloy +++ b/nix/modules/nixos/infrastructure/vault-prometheus-sender/prometheus.alloy @@ -50,7 +50,7 @@ prometheus.exporter.unix "integrations_node_exporter" { // Define how to scrape metrics from the node_exporter prometheus.scrape "integrations_node_exporter" { -scrape_interval = "1m" + scrape_interval = "1m" // Use the targets with labels from the discovery.relabel component targets = discovery.relabel.integrations_node_exporter.output // Send the scraped metrics to the relabeling component diff --git a/nix/modules/nixos/services/nginx/default.nix b/nix/modules/nixos/services/nginx/default.nix index 5b97825..6d4607c 100644 --- a/nix/modules/nixos/services/nginx/default.nix +++ b/nix/modules/nixos/services/nginx/default.nix @@ -5,13 +5,44 @@ modulesPath, ... }: -# TODO: Enable and configure prometheus-nginx-exporter and prometheus-nginxlog-exporter -# to get some metrics into prometheus. 
let cfg = config.khscodes.services.nginx; locationOptions = import "${modulesPath}/services/web-servers/nginx/location-options.nix" { inherit lib config; }; + logfmt = '' + log_format logfmt escape=default + 'ts="$time_iso8601" http_host="$http_host" request_method="$request_method" ' + 'uri="$request_uri" ' # full path + 'status=$status ' # response status + 'remote_addr="$remote_addr" remote_port="$remote_port" ' # remote ip and port (port should be dynamic) + 'http_user_agent="$http_user_agent" ' + 'http_referer="$http_referer" ' + 'http_x_forwarded_for="$http_x_forwarded_for" ' + 'server_name="$server_name" ' # the name of the vhost serving the request, may be different from http_host + 'scheme="$scheme" ' # http/https + 'body_bytes_sent=$body_bytes_sent ' # the number of body bytes exclude headers sent to a client + 'bytes_sent=$bytes_sent ' # the number of bytes sent to a client + 'ssl_protocol="$ssl_protocol" ' # TLS protocol + 'ssl_cipher="$ssl_cipher" ' # TLS cipher + 'server_protocol="$server_protocol" ' # request protocol, like HTTP/1.1 or HTTP/2.0 + 'remote_user="$remote_user" ' # http user name (should be unused for this setup) + 'pid="$pid" ' # process pid + 'connection="$connection" ' # connection serial number + 'connection_requests="$connection_requests" ' # number of requests made in connection + 'request_id="$request_id" ' # unique id for the request + 'request_length=$request_length ' # request length (including headers and body) + 'gzip_ratio="$gzip_ratio" ' + 'request_time=$request_time ' # request processing time in seconds with msec resolution + 'upstream="$upstream_addr" ' # upstream backend server for proxied requests + 'upstream_connect_time=$upstream_connect_time ' # upstream handshake time incl. 
TLS + 'upstream_header_time=$upstream_header_time ' # time spent receiving upstream headers + 'upstream_response_time=$upstream_response_time ' # time spent receiving upstream body + 'upstream_response_length=$upstream_response_length ' # upstream response length + 'upstream_cache_status="$upstream_cache_status" ' # cache HIT/MISS where applicable + 'msec=$msec' # unix timestamp with millis in fraction + ; + ''; vhostOption = lib.khscodes.mkSubmodule { description = "nginx vhost"; options = { @@ -133,12 +164,59 @@ in message = "Cannot use `config.khscodes.services.nginx.virtualHosts..acme = {}` without setting config.khscodes.security.acme.dns01Enabled"; } ]; + services.prometheus.exporters = { + nginx = { + enable = config.khscodes.infrastructure.vault-prometheus-sender.enable; + }; + nginxlog = { + enable = config.khscodes.infrastructure.vault-prometheus-sender.enable; + settings = { + namespaces = lib.lists.map (vhost: { + name = vhost; + format = "$remote_addr - $remote_user [$time_local] \"$request\" $status $body_bytes_sent \"$http_referer\" \"$http_user_agent\" \"$http_x_forwarded_for\" $upstream_response_time"; + metrics_override = { + prefix = ""; + }; + namespace_label = "vhost"; + source = { + files = [ "/var/log/nginx/access.${vhost}.log" ]; + }; + histogram_buckets = [ + 0.005 + 0.01 + 0.025 + 0.05 + 0.1 + 0.25 + 0.5 + 1 + 2.5 + 5 + 10 + ]; + }) (lib.attrsets.attrNames cfg.virtualHosts); + }; + }; + }; + systemd.services.prometheus-nginxlog-exporter.serviceConfig.DynamicUser = false; + khscodes.infrastructure.vault-prometheus-sender.exporters.enabled = [ + "nginx" + "nginxlog" + ]; + users = { + users.${config.services.prometheus.exporters.nginxlog.user} = { + group = config.services.prometheus.exporters.nginxlog.group; + extraGroups = [ config.services.nginx.group ]; + isSystemUser = true; + }; + groups.${config.services.prometheus.exporters.nginxlog.group} = { }; + }; services.fail2ban.jails = { nginx-botsearch = { settings = { filter = 
"nginx-botsearch"; port = "http,https"; - logpath = "/var/log/nginx/access.log"; + logpath = "/var/log/nginx/access.fail2ban.log"; backend = "auto"; findtime = 600; maxretry = 5; @@ -148,10 +226,10 @@ in settings = { filter = "nginx-bad-request"; port = "http,https"; - logpath = "/var/log/nginx/access.log"; + logpath = "/var/log/nginx/access.fail2ban.log"; backend = "auto"; findtime = 600; - maxretry = 2; + maxretry = 30; }; }; nginx-req-limit = { @@ -160,7 +238,7 @@ in port = "http,https"; backend = "systemd"; findtime = 600; - maxretry = 3; + maxretry = 5; }; }; }; @@ -190,6 +268,18 @@ in recommendedOptimisation = lib.mkDefault true; recommendedZstdSettings = lib.mkDefault true; recommendedProxySettings = lib.mkDefault true; + commonHttpConfig = '' + ${logfmt} + access_log /var/log/nginx/access.logfmt.log logfmt; + + log_format fail2ban '$remote_addr - $remote_user [$time_local] "$request" ' + '$status $body_bytes_sent "$http_referer" ' + '"$http_user_agent" "$http_x_forwarded_for"'; + log_format nginx_exporter '$remote_addr - $remote_user [$time_local] "$request" ' + '$status $body_bytes_sent "$http_referer" ' + '"$http_user_agent" "$http_x_forwarded_for" $upstream_response_time'; + access_log /var/log/nginx/access.fail2ban.log fail2ban; + ''; appendHttpConfig = '' limit_req_zone $binary_remote_addr zone=nobots:10m rate=20r/s; map $scheme $hsts_header { @@ -210,52 +300,6 @@ in "" "Unknown"; } - log_format json_analytics escape=json '{' - '"msec": "$msec", ' # request unixtime in seconds with a milliseconds resolution - '"connection": "$connection", ' # connection serial number - '"connection_requests": "$connection_requests", ' # number of requests made in connection - '"pid": "$pid", ' # process pid - '"request_id": "$request_id", ' # the unique request id - '"request_length": "$request_length", ' # request length (including headers and body) - '"remote_addr": "$remote_addr", ' # client IP - '"remote_user": "$remote_user", ' # client HTTP username - 
'"remote_port": "$remote_port", ' # client port - '"time_local": "$time_local", ' - '"time_iso8601": "$time_iso8601", ' # local time in the ISO 8601 standard format - '"request": "$request", ' # full path no arguments if the request - '"request_uri": "$request_uri", ' # full path and arguments if the request - '"args": "$args", ' # args - '"status": "$status", ' # response status code - '"body_bytes_sent": "$body_bytes_sent", ' # the number of body bytes exclude headers sent to a client - '"bytes_sent": "$bytes_sent", ' # the number of bytes sent to a client - '"http_referer": "$http_referer", ' # HTTP referer - '"http_user_agent": "$http_user_agent", ' # user agent - '"http_x_forwarded_for": "$http_x_forwarded_for", ' # http_x_forwarded_for - '"http_host": "$http_host", ' # the request Host: header - '"server_name": "$server_name", ' # the name of the vhost serving the request - '"request_time": "$request_time", ' # request processing time in seconds with msec resolution - '"upstream": "$upstream_addr", ' # upstream backend server for proxied requests - '"upstream_connect_time": "$upstream_connect_time", ' # upstream handshake time incl. TLS - '"upstream_header_time": "$upstream_header_time", ' # time spent receiving upstream headers - '"upstream_response_time": "$upstream_response_time", ' # time spent receiving upstream body - '"upstream_response_length": "$upstream_response_length", ' # upstream response length - '"upstream_cache_status": "$upstream_cache_status", ' # cache HIT/MISS where applicable - '"ssl_protocol": "$ssl_protocol", ' # TLS protocol - '"ssl_cipher": "$ssl_cipher", ' # TLS cipher - '"scheme": "$scheme", ' # http or https - '"request_method": "$request_method", ' # request method - '"server_protocol": "$server_protocol", ' # request protocol, like HTTP/1.1 or HTTP/2.0 - '"pipe": "$pipe", ' # "p" if request was pipelined, "." 
otherwise - '"gzip_ratio": "$gzip_ratio"' - '}'; - - access_log /var/log/nginx/access.json.log json_analytics; - - log_format main '$remote_addr - $remote_user [$time_local] "$request" ' - '$status $body_bytes_sent "$http_referer" ' - '"$http_user_agent" "$http_x_forwarded_for"'; - access_log /var/log/nginx/access.log main; - ${modernSslAppendedHttpConfig} ''; virtualHosts = lib.attrsets.mapAttrs ( @@ -275,6 +319,9 @@ in extraConfig = '' ${mtls} ${reqLimit} + access_log /var/log/nginx/access.fail2ban.log fail2ban; + access_log /var/log/nginx/access.logfmt.log logfmt; + access_log /var/log/nginx/access.${name}.log nginx_exporter; ${value.extraConfig} ''; in diff --git a/nix/modules/nixos/services/nginx/nginx.alloy b/nix/modules/nixos/services/nginx/nginx.alloy index f60e649..346fedb 100644 --- a/nix/modules/nixos/services/nginx/nginx.alloy +++ b/nix/modules/nixos/services/nginx/nginx.alloy @@ -12,21 +12,33 @@ loki_send "nginx_stream_error" { } loki.source.file "nginx_access_logs" { targets = [{ - __path__ = "/var/log/nginx/access.json.log", + __path__ = "/var/log/nginx/access.logfmt.log", }] forward_to = [loki.process.nginx_access_logs.receiver] } loki.process "nginx_access_logs" { forward_to = [loki_send.nginx_access.receiver] - stage.json { - expressions = { - timestamp = "time_iso8601", + stage.logfmt { + mapping = { + timestamp = "msec", + http_host = "http_host", + level = "status", } } stage.timestamp { source = "timestamp" - format = "RFC3339" + format = "Unix" + } + stage.template { + source = "level" + template = "{{$v := atoi .Value}}{{if gt $v 499}}error{{else if gt $v 399}}warn{{ else if lt $v 200}}debug{{else}}info{{end}}" + } + stage.structured_metadata { + values = { + level = "", + http_host = "", + } } } diff --git a/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/alerts/http.yaml b/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/alerts/http.yaml new file mode 100644 index 0000000..279f264 --- /dev/null +++ 
b/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/alerts/http.yaml @@ -0,0 +1,21 @@ +groups: + - name: Http + rules: + - alert: NginxDown + expr: > + nginx_up{job="nginx"} == 0 + for: 10m + labels: + severity: critical + annotations: + summary: "Nginx on {{ $labels.instance }} is down" + - alert: HighHttpErrorRate + expr: > + sum by(vhost, instance) (rate(http_response_count_total{status=~"5..",job="nginxlog"}[1m])) / + sum by(vhost, instance) (rate(http_response_count_total{job="nginxlog"}[1m])) + > 0 + for: 30m + labels: + severity: critical + annotations: + summary: "Nginx VHost {{ $labels.vhost }} on {{ $labels.instance }} is generating many internal server errors over the last 30 minutes" diff --git a/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/default.nix b/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/default.nix index 37e82d5..ccd0ca3 100644 --- a/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/default.nix +++ b/nix/systems/x86_64-linux/monitoring.kaareskovgaard.net/default.nix @@ -1,12 +1,18 @@ { inputs, config, + pkgs, ... 
}: let grafana = config.services.grafana; loki = config.services.loki; prometheus = config.services.prometheus; + nginxExporterSrc = "${pkgs.prometheus-nginx-exporter.src}/grafana/dashboard.json"; + postgresqlDashboard = pkgs.fetchurl { + url = "https://grafana.com/api/dashboards/9628/revisions/8/download"; + hash = "sha256-UhusNAZbyt7fJV/DhFUK4FKOmnTpG0R15YO2r+nDnMc="; + }; in { imports = [ @@ -54,16 +60,33 @@ in }; provision = { enable = true; + alerting = { + rules = { + settings = { + deleteRules = [ + { + uid = "desmw56u3jfgga"; + orgId = 1; + } + ]; + }; + }; + }; datasources.settings.datasources = [ { url = "http://${loki.configuration.server.http_listen_address}:${toString loki.configuration.server.http_listen_port}"; type = "loki"; name = "Logs"; + uid = "loki"; } { url = "http://${prometheus.listenAddress}:${toString prometheus.port}"; type = "prometheus"; name = "Metrics"; + uid = "prometheus"; + jsonData = { + manageAlerts = true; + }; } ]; dashboards.settings.providers = [ @@ -71,6 +94,14 @@ in name = "Node Exporter"; options.path = ./grafana/dashboards/node_exporter; } + { + name = "Nginx"; + options.path = nginxExporterSrc; + } + { + name = "Postgresql"; + options.path = postgresqlDashboard; + } ]; }; }; @@ -78,6 +109,10 @@ in enable = true; listenAddress = "127.0.0.1"; extraFlags = [ "--web.enable-otlp-receiver" ]; + # alertmanager.enable = true; + rules = [ + (builtins.readFile ./alerts/http.yaml) + ]; }; services.loki = { enable = true; @@ -130,16 +165,21 @@ in delete_request_store = "filesystem"; working_directory = "${config.services.loki.dataDir}/retention"; }; + limits_config = { + allow_structured_metadata = true; + discover_log_levels = true; + }; }; }; khscodes = { infrastructure.khs-openstack-instance = { enable = true; - flavor = "m.medium"; + flavor = "m.large"; }; services.nginx = { enable = true; virtualHosts."monitoring.kaareskovgaard.net" = { + rateLimit.enable = false; locations."/" = { proxyPass = 
"http://${grafana.settings.server.http_addr}:${toString grafana.settings.server.http_port}"; proxyWebsockets = true;