From 7ed796f12ccb0f5c41e725621f26afce2e7c5172 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Gruszczy=C5=84ski?= Date: Fri, 26 Jun 2026 11:01:55 +0200 Subject: [PATCH] first commit --- README.md | 83 ++++ config.example.json | 19 + varnish_exporter.py | 1092 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1194 insertions(+) create mode 100644 README.md create mode 100644 config.example.json create mode 100755 varnish_exporter.py diff --git a/README.md b/README.md new file mode 100644 index 0000000..e81427d --- /dev/null +++ b/README.md @@ -0,0 +1,83 @@ +# Varnish Prometheus Business Exporter + +Lightweight Python exporter for Varnish. It exposes `/metrics` for Prometheus and focuses on per-domain business/operational statistics rather than only raw counters. + +## Requirements + +- Python 3 +- `varnishstat` +- `varnishlog` +- Access to Varnish shared memory/logs, usually by running as root or with proper permissions + +No external Python dependencies are required. + +## Quick start + +Only varnishstat: + +```bash +python3 varnish_exporter.py --modules core,stat +``` + +Test VSL without sampling: + +```bash +sudo python3 varnish_exporter.py --enable-vsl --vsl-sample 1 --profile full +``` + +Recommended production mode: + +```bash +sudo python3 varnish_exporter.py \ + --port 9131 \ + --enable-vsl \ + --profile standard \ + --vsl-sample 0.001 +``` + +Then check: + +```bash +curl -s http://127.0.0.1:9131/metrics | grep varnish_domain +``` + +## Profiles + +- `minimal`: rps, hit ratio, backend ratio, 5xx ratio, p95 latency +- `standard`: recommended business-oriented domain statistics +- `full`: more detailed derived statistics +- `raw`: full plus raw HTTP counters/histograms + +## Modules + +- `core`: exporter self metrics +- `stat`: `varnishstat -1 -j` +- `vsl`: `varnishlog -g request` +- `domain`: derived per-domain statistics +- `raw`: raw request counters/histograms + +Example: + +```bash +sudo python3 varnish_exporter.py --modules 
core,stat,vsl,domain --profile standard +``` + +## Domain grouping + +Use `config.example.json` as a template: + +```bash +sudo python3 varnish_exporter.py \ + --config ./config.example.json \ + --enable-vsl \ + --profile standard +``` + +If no config is provided, the exporter uses the real Host header as the `site` label. + +## Important notes + +- Varnish cannot measure real browser render time such as LCP/FCP/DOMContentLoaded. +- Latency here means server-side response time as observed by Varnish. +- For high traffic, avoid `--vsl-sample 1` in production. +- Start production with `--vsl-sample 0.001` or `0.0001`. diff --git a/config.example.json b/config.example.json new file mode 100644 index 0000000..2b71541 --- /dev/null +++ b/config.example.json @@ -0,0 +1,19 @@ +{ + "site_rules": [ + { + "match": "(^|\\.)example\\.com$", + "site": "example_com" + }, + { + "match": "(^|\\.)static\\.example\\.com$", + "site": "static_example_com" + }, + { + "match": "(^|\\.)api\\.example\\.com$", + "site": "api_example_com" + } + ], + "default_site": "other", + "allowed_methods": ["GET", "HEAD", "POST", "PUT", "PATCH", "DELETE", "OPTIONS"], + "histogram_buckets": [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10] +} diff --git a/varnish_exporter.py b/varnish_exporter.py new file mode 100755 index 0000000..fb0260a --- /dev/null +++ b/varnish_exporter.py @@ -0,0 +1,1092 @@ +#!/usr/bin/env python3 +""" +Varnish Prometheus Business Exporter + +Purpose: + A lightweight Prometheus exporter for Varnish focused not only on raw counters, + but also on ready-to-use business and operational statistics per domain/site. + +Data sources: + 1. varnishstat -1 -j + - global Varnish metrics + - storage, backends, cache hits/misses, workers, locks, etc. + + 2. 
varnishlog -g request + - sampled request stream + - per-domain aggregation + - hit/miss/pass/backend ratios + - latency p50/p90/p95/p99 + - error ratios + - RPS + - saved backend RPS + +Main domain metrics: + varnish_domain_rps + varnish_domain_hit_ratio + varnish_domain_miss_ratio + varnish_domain_pass_ratio + varnish_domain_backend_ratio + varnish_domain_cache_efficiency_ratio + varnish_domain_saved_backend_rps + varnish_domain_error_ratio + varnish_domain_4xx_ratio + varnish_domain_5xx_ratio + varnish_domain_avg_latency_seconds + varnish_domain_p50_latency_seconds + varnish_domain_p90_latency_seconds + varnish_domain_p95_latency_seconds + varnish_domain_p99_latency_seconds + varnish_domain_slow_100ms_ratio + varnish_domain_slow_250ms_ratio + varnish_domain_slow_500ms_ratio + varnish_domain_slow_1s_ratio + +Profiles: + minimal: + - exporter health + - varnishstat + - per-domain: rps, hit_ratio, backend_ratio, 5xx_ratio, p95 + + standard: + - minimal + + - miss/pass ratio + - cache efficiency + - backend rps + - saved backend rps + - error ratio + - avg/p50/p90/p99 latency + - slow ratios + + full: + - standard + + - per-cache average latency + - 2xx/3xx/4xx/5xx rps/ratio + - pipe/synth/unknown ratios + + raw: + - full + + - raw varnish_http_requests_total + - raw varnish_http_request_duration_seconds + - varnish_domain_response_time_seconds histogram + +Modules: + core: + - exporter self metrics + + stat: + - varnishstat metrics + + vsl: + - varnishlog collector + + domain: + - derived per-domain metrics + + raw: + - raw HTTP request counters/histograms + +Defaults: + --modules core,stat,vsl,domain + --profile standard + --vsl-sample 0.001 + +Examples: + + Local test without sampling: + sudo python3 varnish_exporter.py --enable-vsl --vsl-sample 1 --profile full + + Production: + sudo python3 varnish_exporter.py --enable-vsl --vsl-sample 0.001 --profile standard + + Only varnishstat: + python3 varnish_exporter.py --modules core,stat + + Only domain aggregates, no 
def load_config(path):
    """Load the exporter configuration and precompute its lookup structures.

    Args:
        path: Path to a JSON configuration file. A falsy value selects a
            copy of ``DEFAULT_CONFIG``.

    Returns:
        A dict holding ``site_rules``, ``default_site``, ``allowed_methods``
        and ``histogram_buckets`` (missing keys are filled in from the
        defaults) plus two derived entries: ``site_rules_compiled``, a list
        of ``(compiled_regex, site)`` pairs compiled case-insensitively, and
        ``allowed_methods_set``, the upper-cased method names as a set.

    Raises:
        OSError: The file cannot be opened.
        ValueError: The file is not valid JSON (``json.JSONDecodeError`` is a
            ``ValueError`` subclass) or a setting has the wrong shape.
    """
    if not path:
        cfg = dict(DEFAULT_CONFIG)
    else:
        with open(path, "r", encoding="utf-8") as f:
            cfg = json.load(f)

        # Anything but a JSON object would fail below with an opaque
        # AttributeError on setdefault(); report it in config terms instead.
        if not isinstance(cfg, dict):
            raise ValueError(f"{path}: top-level JSON value must be an object")

    cfg.setdefault("site_rules", [])
    cfg.setdefault("default_site", "other")
    cfg.setdefault("allowed_methods", DEFAULT_CONFIG["allowed_methods"])
    cfg.setdefault("histogram_buckets", DEFAULT_CONFIG["histogram_buckets"])

    rules = cfg["site_rules"]
    if not isinstance(rules, list):
        raise ValueError("site_rules must be a list")

    compiled = []
    for index, rule in enumerate(rules):
        try:
            compiled.append((re.compile(rule["match"], re.I), rule["site"]))
        except (KeyError, TypeError) as exc:
            raise ValueError(
                f"site_rules[{index}] must be an object with string 'match' and 'site' entries"
            ) from exc
        except re.error as exc:
            raise ValueError(f"site_rules[{index}]: invalid regular expression: {exc}") from exc
    cfg["site_rules_compiled"] = compiled

    methods = cfg["allowed_methods"]
    if not isinstance(methods, (list, tuple)):
        # A bare string would otherwise be split into single characters by set().
        raise ValueError("allowed_methods must be a list of method names")
    # normalize_method() upper-cases the observed method before the lookup,
    # so a lower-case entry in the config could never match.
    cfg["allowed_methods_set"] = {str(m).upper() for m in methods}

    buckets = cfg["histogram_buckets"]
    if not isinstance(buckets, (list, tuple)):
        raise ValueError("histogram_buckets must be a list of numbers")
    for bound in buckets:
        is_number = isinstance(bound, (int, float)) and not isinstance(bound, bool)
        # The chained comparison is False for NaN and for both infinities;
        # the +Inf bucket is added by the exporter itself.
        if not is_number or not float("-inf") < bound < float("inf"):
            raise ValueError(f"histogram_buckets: {bound!r} is not a finite number")
    # Ascending, de-duplicated bounds; a new list, so the defaults are not mutated.
    cfg["histogram_buckets"] = sorted(set(buckets))

    return cfg
def normalize_host(host):
    """Reduce a Host header value to a bare, lower-case host name.

    Handles, in order: surrounding whitespace and letter case, a bracketed
    IPv6 literal with optional port (``[::1]:8080`` -> ``::1``), a single
    ``:port`` suffix, and a trailing root dot.

    Args:
        host: Raw Host header value; ``None`` and other falsy values are
            treated as an empty string.

    Returns:
        The normalized host, or ``""`` when nothing usable remains.
    """
    host = str(host or "").strip().lower()

    if not host:
        return ""

    if host.startswith("["):
        closing = host.find("]")
        if closing != -1:
            return host[1:closing]

    # Exactly one colon means "name:port". Two or more can only be an
    # unbracketed IPv6 literal, which must not be cut at its first colon.
    if host.count(":") == 1:
        host = host.partition(":")[0]

    # Strip the root dot last so that "example.com.:8080" is handled too.
    return host.rstrip(".")
target = count * q + prev_le = 0.0 + prev_count = 0.0 + + for le in finite: + current_count = buckets.get(le, 0.0) + + if current_count >= target: + if current_count <= prev_count: + return le + + fraction = (target - prev_count) / (current_count - prev_count) + return prev_le + ((le - prev_le) * fraction) + + prev_le = le + prev_count = current_count + + return finite[-1] + + +class Metrics: + def __init__(self, buckets, max_series, window_seconds, bucket_seconds, profile, modules): + self.lock = threading.RLock() + self.buckets = buckets + self.max_series = max_series + self.window_seconds = window_seconds + self.bucket_seconds = bucket_seconds + self.profile = profile + self.modules = modules + + self.gauges = {} + self.counters = {} + self.histograms = defaultdict(lambda: { + "buckets": defaultdict(float), + "sum": 0.0, + "count": 0.0, + }) + + self.series_seen = set() + self.dropped_series = 0 + + self.domain_window = defaultdict(lambda: defaultdict(float)) + + def module_enabled(self, name): + return name in self.modules + + def emit_raw(self): + return self.profile == "raw" or self.module_enabled("raw") + + def emit_minimal(self): + return self.profile in ("minimal", "standard", "full", "raw") + + def emit_standard(self): + return self.profile in ("standard", "full", "raw") + + def emit_full(self): + return self.profile in ("full", "raw") + + def _label_key(self, labels): + return tuple(sorted((str(k), str(v)) for k, v in labels.items())) + + def _allow_series(self, metric, labels): + key = (metric, labels) + + if key in self.series_seen: + return True + + if len(self.series_seen) >= self.max_series: + self.dropped_series += 1 + return False + + self.series_seen.add(key) + return True + + def set_gauge(self, name, value, **labels): + labels_key = self._label_key(labels) + + with self.lock: + if not self._allow_series(name, labels_key): + return + self.gauges[(name, labels_key)] = float(value) + + def set_counter(self, name, value, **labels): + if not 
name.endswith("_total"): + name += "_total" + + labels_key = self._label_key(labels) + + with self.lock: + if not self._allow_series(name, labels_key): + return + self.counters[(name, labels_key)] = float(value) + + def inc_counter(self, name, amount=1.0, **labels): + if not name.endswith("_total"): + name += "_total" + + labels_key = self._label_key(labels) + + with self.lock: + if not self._allow_series(name, labels_key): + return + self.counters[(name, labels_key)] = self.counters.get((name, labels_key), 0.0) + float(amount) + + def observe(self, name, value, weight=1.0, **labels): + labels_key = self._label_key(labels) + + with self.lock: + if not self._allow_series(name, labels_key): + return + + h = self.histograms[(name, labels_key)] + h["sum"] += float(value) * float(weight) + h["count"] += float(weight) + + for b in self.buckets: + if value <= b: + h["buckets"][b] += float(weight) + + h["buckets"][float("inf")] += float(weight) + + def record_http(self, site, method, status_class_value, cache, latency, weight): + raw_labels = { + "site": site, + "method": method, + "status_class": status_class_value, + "cache": cache, + } + + if self.emit_raw(): + self.inc_counter("varnish_http_requests_total", weight, **raw_labels) + + if latency is not None: + self.observe( + "varnish_http_request_duration_seconds", + latency, + weight=weight, + **raw_labels, + ) + + self.observe( + "varnish_domain_response_time_seconds", + latency, + weight=weight, + site=site, + ) + + if not self.module_enabled("domain"): + return + + now = int(time.time()) + bucket_ts = now - (now % self.bucket_seconds) + key = (bucket_ts, site) + + with self.lock: + b = self.domain_window[key] + + b["total"] += weight + b[f"cache_{cache}"] += weight + + if cache in ("hit", "miss"): + b["cacheable"] += weight + + if cache in ("miss", "pass"): + b["backend"] += weight + + if status_class_value == "2xx": + b["2xx"] += weight + elif status_class_value == "3xx": + b["3xx"] += weight + elif 
status_class_value == "4xx": + b["4xx"] += weight + b["errors"] += weight + elif status_class_value == "5xx": + b["5xx"] += weight + b["errors"] += weight + + if latency is not None: + b["latency_sum"] += latency * weight + b["latency_count"] += weight + b[f"cache_{cache}_latency_sum"] += latency * weight + b[f"cache_{cache}_latency_count"] += weight + + if latency > 0.05: + b["slow_50ms"] += weight + if latency > 0.1: + b["slow_100ms"] += weight + if latency > 0.25: + b["slow_250ms"] += weight + if latency > 0.5: + b["slow_500ms"] += weight + if latency > 1.0: + b["slow_1s"] += weight + if latency > 2.5: + b["slow_2500ms"] += weight + if latency > 5.0: + b["slow_5s"] += weight + + for le in self.buckets: + if latency <= le: + b[f"latency_le_{le}"] += weight + + b["latency_le_inf"] += weight + + self.cleanup_window_locked(now) + + def cleanup_window_locked(self, now): + min_ts = now - self.window_seconds - self.bucket_seconds + + for key in list(self.domain_window.keys()): + bucket_ts, _site = key + + if bucket_ts < min_ts: + del self.domain_window[key] + + def calculate_domain_window_stats(self): + if not self.module_enabled("domain"): + return {} + + now = int(time.time()) + min_ts = now - self.window_seconds + + stats = defaultdict(lambda: defaultdict(float)) + + with self.lock: + self.cleanup_window_locked(now) + items = list(self.domain_window.items()) + + for (bucket_ts, site), values in items: + if bucket_ts < min_ts: + continue + + s = stats[site] + + for k, v in values.items(): + s[k] += float(v) + + out = {} + + for site, s in stats.items(): + total = s.get("total", 0.0) + + if total <= 0: + continue + + labels = (("site", site),) + window = float(self.window_seconds) + + hit = s.get("cache_hit", 0.0) + miss = s.get("cache_miss", 0.0) + passed = s.get("cache_pass", 0.0) + pipe = s.get("cache_pipe", 0.0) + synth = s.get("cache_synth", 0.0) + unknown_cache = s.get("cache_unknown", 0.0) + + backend = s.get("backend", 0.0) + cacheable = s.get("cacheable", 
0.0) + + e2xx = s.get("2xx", 0.0) + e3xx = s.get("3xx", 0.0) + e4xx = s.get("4xx", 0.0) + e5xx = s.get("5xx", 0.0) + errors = s.get("errors", 0.0) + + out[("varnish_domain_rps", labels)] = total / window + out[("varnish_domain_hit_ratio", labels)] = hit / total + out[("varnish_domain_backend_ratio", labels)] = backend / total + out[("varnish_domain_5xx_ratio", labels)] = e5xx / total + + latency_count = s.get("latency_count", 0.0) + + latency_buckets = None + + if latency_count > 0: + latency_buckets = {} + + for le in self.buckets: + latency_buckets[le] = s.get(f"latency_le_{le}", 0.0) + + latency_buckets[float("inf")] = s.get("latency_le_inf", 0.0) + + out[("varnish_domain_p95_latency_seconds", labels)] = estimate_quantile_from_buckets( + latency_buckets, + latency_count, + 0.95, + ) + + if self.emit_standard(): + out[("varnish_domain_requests_per_second", labels)] = total / window + out[("varnish_domain_hit_rps", labels)] = hit / window + out[("varnish_domain_miss_rps", labels)] = miss / window + out[("varnish_domain_pass_rps", labels)] = passed / window + out[("varnish_domain_backend_rps", labels)] = backend / window + out[("varnish_domain_saved_backend_rps", labels)] = hit / window + + out[("varnish_domain_miss_ratio", labels)] = miss / total + out[("varnish_domain_pass_ratio", labels)] = passed / total + out[("varnish_domain_backend_ratio", labels)] = backend / total + out[("varnish_domain_cacheable_ratio", labels)] = cacheable / total + out[("varnish_domain_4xx_ratio", labels)] = e4xx / total + out[("varnish_domain_error_ratio", labels)] = errors / total + + if cacheable > 0: + out[("varnish_domain_cache_efficiency_ratio", labels)] = hit / cacheable + else: + out[("varnish_domain_cache_efficiency_ratio", labels)] = 0.0 + + if latency_count > 0 and latency_buckets is not None: + out[("varnish_domain_avg_latency_seconds", labels)] = s.get("latency_sum", 0.0) / latency_count + out[("varnish_domain_latency_observed_ratio", labels)] = latency_count / total + + 
out[("varnish_domain_p50_latency_seconds", labels)] = estimate_quantile_from_buckets( + latency_buckets, + latency_count, + 0.50, + ) + out[("varnish_domain_p90_latency_seconds", labels)] = estimate_quantile_from_buckets( + latency_buckets, + latency_count, + 0.90, + ) + out[("varnish_domain_p99_latency_seconds", labels)] = estimate_quantile_from_buckets( + latency_buckets, + latency_count, + 0.99, + ) + + out[("varnish_domain_slow_100ms_ratio", labels)] = s.get("slow_100ms", 0.0) / latency_count + out[("varnish_domain_slow_250ms_ratio", labels)] = s.get("slow_250ms", 0.0) / latency_count + out[("varnish_domain_slow_500ms_ratio", labels)] = s.get("slow_500ms", 0.0) / latency_count + out[("varnish_domain_slow_1s_ratio", labels)] = s.get("slow_1s", 0.0) / latency_count + + if self.emit_full(): + out[("varnish_domain_pipe_rps", labels)] = pipe / window + out[("varnish_domain_synth_rps", labels)] = synth / window + out[("varnish_domain_unknown_cache_rps", labels)] = unknown_cache / window + + out[("varnish_domain_pipe_ratio", labels)] = pipe / total + out[("varnish_domain_synth_ratio", labels)] = synth / total + out[("varnish_domain_unknown_cache_ratio", labels)] = unknown_cache / total + + out[("varnish_domain_2xx_ratio", labels)] = e2xx / total + out[("varnish_domain_3xx_ratio", labels)] = e3xx / total + out[("varnish_domain_2xx_rps", labels)] = e2xx / window + out[("varnish_domain_3xx_rps", labels)] = e3xx / window + out[("varnish_domain_4xx_rps", labels)] = e4xx / window + out[("varnish_domain_5xx_rps", labels)] = e5xx / window + out[("varnish_domain_error_rps", labels)] = errors / window + + if latency_count > 0: + out[("varnish_domain_slow_50ms_ratio", labels)] = s.get("slow_50ms", 0.0) / latency_count + out[("varnish_domain_slow_2500ms_ratio", labels)] = s.get("slow_2500ms", 0.0) / latency_count + out[("varnish_domain_slow_5s_ratio", labels)] = s.get("slow_5s", 0.0) / latency_count + + for cache_name in ("hit", "miss", "pass", "pipe", "synth", "unknown"): + 
cache_latency_count = s.get(f"cache_{cache_name}_latency_count", 0.0) + cache_latency_sum = s.get(f"cache_{cache_name}_latency_sum", 0.0) + + if cache_latency_count > 0: + metric = f"varnish_domain_{cache_name}_avg_latency_seconds" + out[(metric, labels)] = cache_latency_sum / cache_latency_count + + return out + + def render(self): + with self.lock: + gauges = dict(self.gauges) + counters = dict(self.counters) + histograms = { + k: { + "buckets": dict(v["buckets"]), + "sum": v["sum"], + "count": v["count"], + } + for k, v in self.histograms.items() + } + series_count = len(self.series_seen) + dropped_series = self.dropped_series + + derived = self.calculate_domain_window_stats() + + for key, value in derived.items(): + gauges[key] = value + + out = [] + + if self.module_enabled("core"): + out.append("# TYPE varnish_exporter_series gauge") + out.append(f"varnish_exporter_series {series_count}") + + out.append("# TYPE varnish_exporter_dropped_series_total counter") + out.append(f"varnish_exporter_dropped_series_total {dropped_series}") + + out.append("# TYPE varnish_exporter_window_seconds gauge") + out.append(f"varnish_exporter_window_seconds {self.window_seconds}") + + out.append("# TYPE varnish_exporter_profile gauge") + out.append(f'varnish_exporter_profile{{profile="{self.profile}"}} 1') + + for module in sorted(self.modules): + out.append(f'varnish_exporter_module_enabled{{module="{module}"}} 1') + + typed = set() + + for (name, labels), value in sorted(gauges.items()): + if name not in typed: + out.append(f"# TYPE {name} gauge") + typed.add(name) + out.append(format_metric(name, labels, value)) + + for (name, labels), value in sorted(counters.items()): + if name not in typed: + out.append(f"# TYPE {name} counter") + typed.add(name) + out.append(format_metric(name, labels, value)) + + for (name, labels), h in sorted(histograms.items()): + if name not in typed: + out.append(f"# TYPE {name} histogram") + typed.add(name) + + base_labels = dict(labels) + + for b 
in self.buckets: + lb = dict(base_labels) + lb["le"] = str(b) + out.append(format_metric(name + "_bucket", tuple(sorted(lb.items())), h["buckets"].get(b, 0.0))) + + lb = dict(base_labels) + lb["le"] = "+Inf" + out.append(format_metric(name + "_bucket", tuple(sorted(lb.items())), h["buckets"].get(float("inf"), 0.0))) + out.append(format_metric(name + "_sum", labels, h["sum"])) + out.append(format_metric(name + "_count", labels, h["count"])) + + return "\n".join(out) + "\n" + + +class VarnishStatCollector(threading.Thread): + def __init__(self, metrics, interval, instance): + super().__init__(daemon=True) + self.metrics = metrics + self.interval = interval + self.instance = instance + + def run(self): + while True: + started = time.time() + + try: + self.collect() + self.metrics.set_gauge("varnish_exporter_collector_up", 1, collector="varnishstat") + except Exception: + self.metrics.set_gauge("varnish_exporter_collector_up", 0, collector="varnishstat") + + self.metrics.set_gauge( + "varnish_exporter_collector_duration_seconds", + time.time() - started, + collector="varnishstat", + ) + + time.sleep(self.interval) + + def collect(self): + cmd = ["varnishstat", "-1", "-j"] + + if self.instance: + cmd.extend(["-n", self.instance]) + + raw = subprocess.check_output(cmd, text=True, timeout=10) + data = json.loads(raw) + + for key, item in data.items(): + if not isinstance(item, dict) or "value" not in item: + continue + + value = item.get("value", 0) + flag = item.get("flag", "g") + metric, labels = self.metric_from_key(key) + + if flag == "c": + self.metrics.set_counter(metric, value, **labels) + else: + self.metrics.set_gauge(metric, value, **labels) + + def metric_from_key(self, key): + parts = key.split(".") + section = prom_name(parts[0]) + + if len(parts) == 2: + return f"varnish_{section}_{prom_name(parts[1])}", {} + + labeled_sections = { + "VBE": "backend", + "SMA": "storage", + "SMF": "storage", + "MSE": "storage", + "LCK": "lock", + } + + if parts[0] in 
class VarnishLogCollector(threading.Thread):
    """Daemon thread that tails ``varnishlog -g request`` and records sampled requests.

    Each request group is kept with probability ``sample_rate``. A kept
    request is recorded with weight ``1 / sample_rate`` so the aggregated
    totals estimate the unsampled traffic.
    """

    # Compiled once; both patterns are applied to every log line.
    # Only lines prefixed with a single "-" match, so records nested deeper
    # in the group (prefixed "--") are ignored.
    _LINE_RE = re.compile(r"^\s*-\s+([A-Za-z0-9_]+)(?:\s+(.*))?$")
    # "<event>: <float> <float> ..." -- the second number is captured.
    # NOTE(review): presumably the time since the start of the request, as
    # described for the Timestamp tag in vsl(7) -- confirm.
    _TIMESTAMP_RE = re.compile(r"^([A-Za-z_]+):\s+\d+\.\d+\s+([0-9.]+)")
    _CACHE_STATES = frozenset({"HIT", "MISS", "PASS", "PIPE", "SYNTH"})

    def __init__(self, metrics, cfg, instance, sample_rate):
        """Store collaborators and derive the per-request weight.

        Args:
            metrics: Object providing ``set_gauge`` and ``record_http``.
            cfg: Configuration dict passed to the normalization helpers.
            instance: Value for ``varnishlog -n``; empty to omit the option.
            sample_rate: Probability in (0, 1] of keeping a request.
        """
        super().__init__(daemon=True)
        self.metrics = metrics
        self.cfg = cfg
        self.instance = instance
        self.sample_rate = sample_rate
        self.sample_weight = 1.0 / sample_rate if sample_rate > 0 else 0.0

    def run(self):
        """Keep a ``varnishlog`` stream alive, restarting it after it ends or fails."""
        while True:
            try:
                self.stream()
            except Exception:
                # Thread boundary: the collector must survive any failure and
                # report it through the gauge rather than die silently.
                self.metrics.set_gauge("varnish_exporter_collector_up", 0, collector="varnishlog")

            # Back off before restarting, after a failure and after a clean
            # exit of varnishlog alike.
            time.sleep(2)

    def stream(self):
        """Start ``varnishlog``, consume its output, then stop and reap it.

        Returns when ``varnishlog`` closes its output. The child process is
        terminated and waited for on every exit path, so repeated restarts
        leave no stray or zombie processes behind.

        Raises:
            OSError: ``varnishlog`` cannot be started.
        """
        cmd = [
            "varnishlog",
            "-g", "request",
            "-i", "ReqMethod,ReqHeader,RespStatus,VCL_call,Timestamp,End",
        ]

        if self.instance:
            cmd.extend(["-n", self.instance])

        proc = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            text=True,
            bufsize=1,
        )

        self.metrics.set_gauge("varnish_exporter_collector_up", 1, collector="varnishlog")

        try:
            self._consume(proc.stdout)
        finally:
            if proc.poll() is None:
                proc.terminate()
                try:
                    proc.wait(timeout=5)
                except subprocess.TimeoutExpired:
                    proc.kill()
                    proc.wait()

            proc.stdout.close()
            # Leaving this method means the stream is gone, whether it failed
            # or varnishlog exited by itself.
            self.metrics.set_gauge("varnish_exporter_collector_up", 0, collector="varnishlog")

    def _consume(self, lines):
        """Group log lines into requests and hand each sampled one to ``finish_tx``."""
        tx = None
        sampled = False

        for line in lines:
            line = line.rstrip("\n")

            if "<< Request" in line:
                # A new request header also closes a request that never
                # produced an End record.
                if tx:
                    self.finish_tx(tx)

                sampled = random.random() < self.sample_rate
                tx = {} if sampled else None
                continue

            if not sampled or tx is None:
                continue

            parsed = self.parse_line(line)

            if not parsed:
                continue

            tag, value = parsed

            if tag == "End":
                self.finish_tx(tx)
                tx = None
                sampled = False

            elif tag == "ReqMethod":
                tx["method"] = value.split()[0] if value else "UNKNOWN"

            elif tag == "RespStatus":
                tx["status"] = value.split()[0] if value else "0"

            elif tag == "ReqHeader":
                if value.lower().startswith("host:"):
                    tx["host"] = value.split(":", 1)[1].strip()

            elif tag == "VCL_call":
                cache = self.cache_state(value)

                if cache:
                    tx["cache"] = cache

            elif tag == "Timestamp":
                name, latency = self.parse_timestamp(value)

                if latency is not None:
                    if name == "Resp":
                        tx["response_time"] = latency
                    elif name == "Fetch":
                        tx["backend_time"] = latency
                    else:
                        # Largest value of any other event; finish_tx falls
                        # back to it when no Resp timestamp was seen.
                        tx["fallback_time"] = max(tx.get("fallback_time", 0.0), latency)

        if tx:
            self.finish_tx(tx)

    def parse_line(self, line):
        """Split a ``-  Tag  value`` log line into ``(tag, value)``.

        Returns:
            ``(tag, value)`` with ``value`` stripped (``""`` when absent), or
            ``None`` when the line does not have that shape.
        """
        m = self._LINE_RE.match(line)

        if not m:
            return None

        return m.group(1), (m.group(2) or "").strip()

    def cache_state(self, value):
        """Map a ``VCL_call`` value to a lower-case cache state, or ``None``."""
        value = value.upper().strip()

        if value in self._CACHE_STATES:
            return value.lower()

        return None

    def parse_timestamp(self, value):
        """Parse a ``Timestamp`` record value.

        Returns:
            ``(event_name, seconds)`` taken from the second number of the
            record, or ``(None, None)`` when the value cannot be parsed.
        """
        m = self._TIMESTAMP_RE.match(value)

        if not m:
            return None, None

        try:
            return m.group(1), float(m.group(2))
        except ValueError:
            # "[0-9.]+" also matches strings such as "1.2.3".
            return None, None

    def finish_tx(self, tx):
        """Normalize a collected request and record it; empty input is ignored."""
        if not tx:
            return

        method = normalize_method(tx.get("method", "UNKNOWN"), self.cfg)
        status = status_class(tx.get("status", "0"))
        cache = tx.get("cache", "unknown")
        site = site_group(tx.get("host", ""), self.cfg)
        latency = tx.get("response_time", tx.get("fallback_time"))

        self.metrics.record_http(
            site=site,
            method=method,
            status_class_value=status,
            cache=cache,
            latency=latency,
            weight=self.sample_weight,
        )
def log_message(self, fmt, *args): + return + + +def build_parser(): + epilog = """ +Examples: + + Only varnishstat: + python3 varnish_exporter.py --modules core,stat + + Test VSL without sampling: + sudo python3 varnish_exporter.py --enable-vsl --vsl-sample 1 --profile full + + Recommended production mode: + sudo python3 varnish_exporter.py --enable-vsl --vsl-sample 0.001 --profile standard + + Debug with raw metrics: + sudo python3 varnish_exporter.py --modules core,stat,vsl,domain,raw --profile raw --vsl-sample 1 + + With domain config: + sudo python3 varnish_exporter.py --config /etc/varnish-exporter/config.json --enable-vsl + +Modules: + core exporter self metrics + stat varnishstat -1 -j + vsl varnishlog -g request + domain derived per-domain statistics + raw raw request counters/histograms + +Profiles: + minimal small set: rps, hit ratio, backend ratio, 5xx, p95 + standard recommended: business-oriented domain statistics + full more detailed statistics + raw full + raw HTTP metrics +""" + + parser = argparse.ArgumentParser( + description="Varnish Prometheus Business Exporter - per-domain statistics from varnishstat and varnishlog.", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=epilog, + ) + + parser.add_argument("--listen", default="0.0.0.0", help="HTTP listen address. Default: 0.0.0.0") + parser.add_argument("--port", type=int, default=9131, help="HTTP /metrics port. Default: 9131") + parser.add_argument("--instance", default="", help="Varnish instance for -n. Usually empty.") + parser.add_argument("--config", default="", help="Path to config.json with domain/site rules.") + parser.add_argument("--stat-interval", type=int, default=5, help="varnishstat interval in seconds. Default: 5") + + parser.add_argument( + "--modules", + type=parse_modules, + default=parse_modules("core,stat,vsl,domain"), + help="Modules to enable: core,stat,vsl,domain,raw or all. 
def main():
    """Parse the command line, start the enabled collectors and serve ``/metrics``.

    Blocks in ``serve_forever`` until interrupted.

    Raises:
        SystemExit: An option is out of range or the config cannot be loaded.
    """
    parser = build_parser()
    args = parser.parse_args()

    modules = set(args.modules)

    if args.enable_vsl or args.enable_varnishlog:
        modules.add("vsl")

    if "raw" in modules and args.profile != "raw":
        args.profile = "raw"

    # The chained comparison also rejects NaN, which the two separate
    # comparisons "<= 0 or > 1" would let through.
    if not 0 < args.vsl_sample <= 1:
        raise SystemExit("--vsl-sample must be in range 0 < x <= 1")

    if args.window_seconds < 10:
        raise SystemExit("--window-seconds must be >= 10")

    if args.bucket_seconds < 1:
        raise SystemExit("--bucket-seconds must be >= 1")

    # A bucket longer than the window can age out of the window before it is
    # ever read, so the domain statistics would intermittently be empty.
    if args.bucket_seconds > args.window_seconds:
        raise SystemExit("--bucket-seconds must be <= --window-seconds")

    # The interval is passed to time.sleep(): a negative value raises inside
    # the collector thread, and 0 runs varnishstat in a tight loop.
    if args.stat_interval < 1:
        raise SystemExit("--stat-interval must be >= 1")

    # With a non-positive limit every series is dropped.
    if args.max_series < 1:
        raise SystemExit("--max-series must be >= 1")

    if not 0 <= args.port <= 65535:
        raise SystemExit("--port must be in range 0..65535")

    try:
        cfg = load_config(args.config)
    except (OSError, ValueError, KeyError, TypeError, re.error) as exc:
        # json.JSONDecodeError is a ValueError subclass.
        raise SystemExit(f"cannot load config {args.config!r}: {exc}") from exc

    metrics = Metrics(
        buckets=cfg["histogram_buckets"],
        max_series=args.max_series,
        window_seconds=args.window_seconds,
        bucket_seconds=args.bucket_seconds,
        profile=args.profile,
        modules=modules,
    )

    if "stat" in modules:
        VarnishStatCollector(
            metrics=metrics,
            interval=args.stat_interval,
            instance=args.instance,
        ).start()
    else:
        metrics.set_gauge("varnish_exporter_collector_up", 0, collector="varnishstat")

    if "vsl" in modules:
        VarnishLogCollector(
            metrics=metrics,
            cfg=cfg,
            instance=args.instance,
            sample_rate=args.vsl_sample,
        ).start()
    else:
        metrics.set_gauge("varnish_exporter_collector_up", 0, collector="varnishlog")

    Handler.metrics = metrics

    server = ThreadingHTTPServer((args.listen, args.port), Handler)
    print(f"listening on http://{args.listen}:{args.port}/metrics")
    print(f"profile={args.profile}")
    print(f"modules={','.join(sorted(modules))}")

    try:
        server.serve_forever()
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop the exporter; exit without a traceback.
        pass
    finally:
        server.server_close()