Files
varnish_exporter/varnish_exporter.py
Mateusz Gruszczyński 7ed796f12c first commit
2026-06-26 11:01:55 +02:00

1093 lines
34 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Varnish Prometheus Business Exporter
Purpose:
A lightweight Prometheus exporter for Varnish focused not only on raw counters,
but also on ready-to-use business and operational statistics per domain/site.
Data sources:
1. varnishstat -1 -j
- global Varnish metrics
- storage, backends, cache hits/misses, workers, locks, etc.
2. varnishlog -g request
- sampled request stream
- per-domain aggregation
- hit/miss/pass/backend ratios
- latency p50/p90/p95/p99
- error ratios
- RPS
- saved backend RPS
Main domain metrics:
varnish_domain_rps
varnish_domain_hit_ratio
varnish_domain_miss_ratio
varnish_domain_pass_ratio
varnish_domain_backend_ratio
varnish_domain_cache_efficiency_ratio
varnish_domain_saved_backend_rps
varnish_domain_error_ratio
varnish_domain_4xx_ratio
varnish_domain_5xx_ratio
varnish_domain_avg_latency_seconds
varnish_domain_p50_latency_seconds
varnish_domain_p90_latency_seconds
varnish_domain_p95_latency_seconds
varnish_domain_p99_latency_seconds
varnish_domain_slow_100ms_ratio
varnish_domain_slow_250ms_ratio
varnish_domain_slow_500ms_ratio
varnish_domain_slow_1s_ratio
Profiles:
minimal:
- exporter health
- varnishstat
- per-domain: rps, hit_ratio, backend_ratio, 5xx_ratio, p95
standard:
- minimal +
- miss/pass ratio
- cache efficiency
- backend rps
- saved backend rps
- error ratio (4xx + 5xx combined) and 4xx ratio
- avg/p50/p90/p99 latency
- slow ratios
full:
- standard +
- per-cache average latency
- 2xx/3xx/4xx/5xx rps/ratio
- pipe/synth/unknown ratios
raw:
- full +
- raw varnish_http_requests_total
- raw varnish_http_request_duration_seconds
- varnish_domain_response_time_seconds histogram
Modules:
core:
- exporter self metrics
stat:
- varnishstat metrics
vsl:
- varnishlog collector
domain:
- derived per-domain metrics
raw:
- raw HTTP request counters/histograms
Defaults:
--modules core,stat,vsl,domain
--profile standard
--vsl-sample 0.001
Examples:
Local test without sampling:
sudo python3 varnish_exporter.py --enable-vsl --vsl-sample 1 --profile full
Production:
sudo python3 varnish_exporter.py --enable-vsl --vsl-sample 0.001 --profile standard
Only varnishstat:
python3 varnish_exporter.py --modules core,stat
Only domain aggregates, no raw HTTP metrics:
sudo python3 varnish_exporter.py --modules core,stat,vsl,domain --profile standard
Debug with raw metrics:
sudo python3 varnish_exporter.py --modules core,stat,vsl,domain,raw --profile raw --vsl-sample 1
JSON config example:
{
"site_rules": [
{
"match": "(^|\\\\.)example\\\\.com$",
"site": "example_com"
},
{
"match": "(^|\\\\.)static\\\\.example\\\\.com$",
"site": "static_example_com"
}
],
"default_site": "other",
"allowed_methods": ["GET", "HEAD", "POST", "PUT", "PATCH", "DELETE", "OPTIONS"],
"histogram_buckets": [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10]
}
Notes:
- Varnish cannot see real browser render time.
- Latency here means server-side response time as seen by Varnish.
- For very high traffic, do not use --vsl-sample 1 in production.
- Sensible production values: 0.001 or 0.0001.
"""
import argparse
import json
import random
import re
import subprocess
import threading
import time
from collections import defaultdict
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
# Built-in configuration. load_config() copies it when no config file is given
# and reads the method list and histogram buckets from it as fallbacks.
DEFAULT_CONFIG = dict(
    site_rules=[],
    default_site="other",
    allowed_methods=["GET", "HEAD", "POST", "PUT", "PATCH", "DELETE", "OPTIONS"],
    histogram_buckets=[0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
)

# Module names accepted by parse_modules().
VALID_MODULES = {"raw", "domain", "vsl", "stat", "core"}
def load_config(path):
    """Load the exporter configuration and attach pre-computed lookup helpers.

    Args:
        path: Path to a JSON config file. A falsy value selects a copy of
            DEFAULT_CONFIG.

    Returns:
        A dict with the keys "site_rules", "default_site", "allowed_methods"
        and "histogram_buckets" (missing ones are filled with defaults), plus
        two derived keys: "site_rules_compiled", a list of
        (case-insensitive compiled regex, site) pairs, and
        "allowed_methods_set", a set of upper-cased method names.

    Raises:
        ValueError: The JSON document's root is not an object.
        OSError, json.JSONDecodeError: The file cannot be read or parsed.
        KeyError, re.error: A site rule lacks "match"/"site" or has an
            invalid pattern.
    """
    if path:
        with open(path, "r", encoding="utf-8") as f:
            cfg = json.load(f)
        # Fail with a clear message instead of an AttributeError further down.
        if not isinstance(cfg, dict):
            raise ValueError(f"config root must be a JSON object: {path}")
    else:
        cfg = dict(DEFAULT_CONFIG)

    cfg.setdefault("site_rules", [])
    cfg.setdefault("default_site", "other")
    cfg.setdefault("allowed_methods", DEFAULT_CONFIG["allowed_methods"])
    cfg.setdefault("histogram_buckets", DEFAULT_CONFIG["histogram_buckets"])

    cfg["site_rules_compiled"] = [
        (re.compile(rule["match"], re.I), rule["site"])
        for rule in cfg["site_rules"]
    ]
    # normalize_method() upper-cases the request method before the membership
    # test, so the set must be upper-cased too or lower-case config entries
    # would never match.
    cfg["allowed_methods_set"] = {str(m).upper() for m in cfg["allowed_methods"]}
    return cfg
def parse_modules(value):
    """Parse a comma-separated module list into a set of module names.

    Entries are trimmed and lower-cased; empty entries are ignored. The entry
    "all" immediately returns a copy of VALID_MODULES.

    Raises:
        argparse.ArgumentTypeError: An entry is not a known module, or the
            list contains no module at all.
    """
    selected = set()
    for token in (part.strip().lower() for part in value.split(",")):
        if token == "":
            continue
        if token == "all":
            return set(VALID_MODULES)
        if token in VALID_MODULES:
            selected.add(token)
            continue
        raise argparse.ArgumentTypeError(
            f"unknown module: {token}; available modules: {','.join(sorted(VALID_MODULES))}"
        )
    if selected:
        return selected
    raise argparse.ArgumentTypeError("module list cannot be empty")
def prom_escape(value):
    """Return str(value) with backslash, double quote and newline escaped."""
    table = str.maketrans({"\\": "\\\\", '"': '\\"', "\n": "\\n"})
    return str(value).translate(table)
def prom_name(name):
    """Turn an arbitrary value into a lower-case metric-name fragment.

    Characters outside [a-z0-9_] become underscores, underscore runs are
    collapsed, and leading/trailing underscores are removed. A result that
    starts with a digit gets a leading underscore; an empty result becomes
    "unknown".
    """
    lowered = str(name).lower()
    replaced = re.sub(r"[^a-z0-9_]", "_", lowered)
    collapsed = re.sub(r"_+", "_", replaced).strip("_")
    if not collapsed:
        return "unknown"
    if collapsed[0].isdigit():
        return "_" + collapsed
    return collapsed
def status_class(status):
    """Map an HTTP status to its class label ("1xx" .. "5xx").

    Returns "unknown" when the value cannot be converted to an integer or
    lies outside 100-599.
    """
    try:
        code = int(status)
    except Exception:
        code = None
    if code is None or code < 100 or code > 599:
        return "unknown"
    return f"{code // 100}xx"
def normalize_method(method, cfg):
    """Upper-case the method; return "OTHER" unless it is in cfg["allowed_methods_set"].

    A falsy method is treated as "UNKNOWN" before the membership test.
    """
    candidate = str(method or "UNKNOWN").upper()
    if candidate in cfg["allowed_methods_set"]:
        return candidate
    return "OTHER"
def normalize_host(host):
    """Reduce a Host header value to a bare lower-case host name.

    Args:
        host: Raw header value; may be None or empty.

    Returns:
        The host without surrounding whitespace, port, IPv6 brackets or
        trailing dot. An empty string when nothing usable remains.
    """
    host = str(host or "").strip().lower()
    if not host:
        return ""
    # Bracketed IPv6 literal, optionally followed by ":port".
    if host.startswith("["):
        end = host.find("]")
        if end != -1:
            return host[1:end]
    # Exactly one colon means "name:port". Several colons without brackets
    # can only be a bare IPv6 literal, which must not be cut at the first one.
    if host.count(":") == 1:
        host = host.split(":", 1)[0]
    # Strip the root dot last so "example.com.:8080" is handled as well.
    return host.rstrip(".")
def site_group(host, cfg):
    """Resolve the "site" label for a request's Host header.

    The normalized host is tested against cfg["site_rules_compiled"] in order
    and the first matching rule's site wins. A host matching no rule is
    returned as-is; an empty host yields cfg["default_site"] ("other" when
    that key is absent).
    """
    name = normalize_host(host)
    if name:
        for pattern, label in cfg["site_rules_compiled"]:
            if pattern.search(name):
                return label
        return name
    return cfg.get("default_site", "other")
def format_metric(name, labels, value):
    """Render one sample line: ``name{k="v",...} value`` or ``name value``.

    Args:
        name: Metric name.
        labels: Iterable of (key, value) pairs, or a falsy value for none.
            Pairs are emitted sorted and values are escaped.
        value: Sample value, formatted with its default string conversion.
    """
    if not labels:
        return f"{name} {value}"
    rendered = []
    for key, raw in sorted(labels):
        rendered.append(f'{key}="{prom_escape(raw)}"')
    body = ",".join(rendered)
    return f"{name}{{{body}}} {value}"
def estimate_quantile_from_buckets(buckets, count, q):
    """Estimate quantile *q* from cumulative histogram buckets.

    Args:
        buckets: Mapping of upper bound -> cumulative count. A float("inf")
            key is ignored.
        count: Total number of observations.
        q: Quantile in the range 0..1.

    Returns:
        A value linearly interpolated inside the first finite bucket whose
        cumulative count reaches ``count * q``. When no finite bucket reaches
        it, the largest finite bound. 0.0 when there are no observations or
        no finite buckets.
    """
    if count <= 0:
        return 0.0
    bounds = sorted(le for le in buckets if le != float("inf"))
    if not bounds:
        return 0.0

    rank = count * q
    lower_bound, lower_count = 0.0, 0.0
    for upper_bound in bounds:
        upper_count = buckets.get(upper_bound, 0.0)
        if not (upper_count >= rank):
            # Target rank lies above this bucket; remember it as the lower edge.
            lower_bound, lower_count = upper_bound, upper_count
            continue
        if upper_count <= lower_count:
            # Nothing landed in this bucket, so interpolation is impossible.
            return upper_bound
        fraction = (rank - lower_count) / (upper_count - lower_count)
        return lower_bound + ((upper_bound - lower_bound) * fraction)
    return bounds[-1]
class Metrics:
    """In-memory metric store and Prometheus text renderer.

    Gauges, counters and histograms are keyed by ``(name, label_tuple)`` where
    ``label_tuple`` is a sorted tuple of ``(key, value)`` string pairs. In
    addition, a sliding window of per-site request aggregates is kept, from
    which the ``varnish_domain_*`` gauges are derived at render time.

    All mutating methods take ``self.lock`` (a re-entrant lock), because the
    collectors and the HTTP handler run in different threads.
    """

    # (threshold in seconds, window key) pairs counted for "slow" ratios.
    _SLOW_THRESHOLDS = (
        (0.05, "slow_50ms"),
        (0.1, "slow_100ms"),
        (0.25, "slow_250ms"),
        (0.5, "slow_500ms"),
        (1.0, "slow_1s"),
        (2.5, "slow_2500ms"),
        (5.0, "slow_5s"),
    )

    def __init__(self, buckets, max_series, window_seconds, bucket_seconds, profile, modules):
        """Create an empty store.

        Args:
            buckets: Histogram upper bounds in seconds.
            max_series: Maximum number of distinct stored series; further new
                series are dropped and counted in ``dropped_series``.
            window_seconds: Length of the sliding window for domain stats.
            bucket_seconds: Width of one time bucket inside that window.
            profile: One of "minimal", "standard", "full", "raw".
            modules: Set of enabled module names.
        """
        self.lock = threading.RLock()
        self.buckets = buckets
        self.max_series = max_series
        self.window_seconds = window_seconds
        self.bucket_seconds = bucket_seconds
        self.profile = profile
        self.modules = modules
        self.gauges = {}
        self.counters = {}
        self.histograms = defaultdict(lambda: {
            "buckets": defaultdict(float),
            "sum": 0.0,
            "count": 0.0,
        })
        self.series_seen = set()
        self.dropped_series = 0
        # (bucket start timestamp, site) -> {aggregate key: weighted count}
        self.domain_window = defaultdict(lambda: defaultdict(float))
        # Time bucket during which record_http() last purged the window.
        self._last_cleanup_bucket = None

    def module_enabled(self, name):
        """Return True when module *name* is enabled."""
        return name in self.modules

    def emit_raw(self):
        """Return True when raw per-request metrics should be recorded."""
        return self.profile == "raw" or self.module_enabled("raw")

    def emit_minimal(self):
        """Return True for every known profile."""
        return self.profile in ("minimal", "standard", "full", "raw")

    def emit_standard(self):
        """Return True for the "standard" profile and above."""
        return self.profile in ("standard", "full", "raw")

    def emit_full(self):
        """Return True for the "full" and "raw" profiles."""
        return self.profile in ("full", "raw")

    def _label_key(self, labels):
        """Convert a label dict into a sorted, hashable tuple of string pairs."""
        return tuple(sorted((str(k), str(v)) for k, v in labels.items()))

    def _allow_series(self, metric, labels):
        """Register a series; return False once ``max_series`` is reached.

        Must be called with ``self.lock`` held. Every rejected call increments
        ``dropped_series``, including repeated calls for the same series.
        """
        key = (metric, labels)
        if key in self.series_seen:
            return True
        if len(self.series_seen) >= self.max_series:
            self.dropped_series += 1
            return False
        self.series_seen.add(key)
        return True

    def set_gauge(self, name, value, **labels):
        """Set gauge *name* with the given labels to ``float(value)``."""
        labels_key = self._label_key(labels)
        with self.lock:
            if not self._allow_series(name, labels_key):
                return
            self.gauges[(name, labels_key)] = float(value)

    def set_counter(self, name, value, **labels):
        """Set a counter to an absolute value; "_total" is appended if missing."""
        if not name.endswith("_total"):
            name += "_total"
        labels_key = self._label_key(labels)
        with self.lock:
            if not self._allow_series(name, labels_key):
                return
            self.counters[(name, labels_key)] = float(value)

    def inc_counter(self, name, amount=1.0, **labels):
        """Add *amount* to a counter; "_total" is appended if missing."""
        if not name.endswith("_total"):
            name += "_total"
        labels_key = self._label_key(labels)
        with self.lock:
            if not self._allow_series(name, labels_key):
                return
            key = (name, labels_key)
            self.counters[key] = self.counters.get(key, 0.0) + float(amount)

    def observe(self, name, value, weight=1.0, **labels):
        """Record *value* in a cumulative histogram, counted *weight* times."""
        labels_key = self._label_key(labels)
        weight = float(weight)
        with self.lock:
            if not self._allow_series(name, labels_key):
                return
            h = self.histograms[(name, labels_key)]
            h["sum"] += float(value) * weight
            h["count"] += weight
            for b in self.buckets:
                if value <= b:
                    h["buckets"][b] += weight
            h["buckets"][float("inf")] += weight

    def record_http(self, site, method, status_class_value, cache, latency, weight):
        """Account for one sampled request.

        Args:
            site: Site label of the request.
            method: Normalized HTTP method.
            status_class_value: Status class such as "2xx" or "unknown".
            cache: Cache outcome such as "hit", "miss", "pass" or "unknown".
            latency: Response time in seconds, or None when not observed.
            weight: How many real requests this sample stands for.
        """
        raw_labels = {
            "site": site,
            "method": method,
            "status_class": status_class_value,
            "cache": cache,
        }
        if self.emit_raw():
            self.inc_counter("varnish_http_requests_total", weight, **raw_labels)
            if latency is not None:
                self.observe(
                    "varnish_http_request_duration_seconds",
                    latency,
                    weight=weight,
                    **raw_labels,
                )
                self.observe(
                    "varnish_domain_response_time_seconds",
                    latency,
                    weight=weight,
                    site=site,
                )
        if not self.module_enabled("domain"):
            return

        now = int(time.time())
        bucket_ts = now - (now % self.bucket_seconds)
        with self.lock:
            b = self.domain_window[(bucket_ts, site)]
            b["total"] += weight
            b[f"cache_{cache}"] += weight
            if cache in ("hit", "miss"):
                b["cacheable"] += weight
            if cache in ("miss", "pass"):
                b["backend"] += weight
            if status_class_value in ("2xx", "3xx", "4xx", "5xx"):
                b[status_class_value] += weight
                if status_class_value in ("4xx", "5xx"):
                    b["errors"] += weight
            if latency is not None:
                b["latency_sum"] += latency * weight
                b["latency_count"] += weight
                b[f"cache_{cache}_latency_sum"] += latency * weight
                b[f"cache_{cache}_latency_count"] += weight
                for threshold, slow_key in self._SLOW_THRESHOLDS:
                    if latency > threshold:
                        b[slow_key] += weight
                for le in self.buckets:
                    if latency <= le:
                        b[f"latency_le_{le}"] += weight
                b["latency_le_inf"] += weight
            # Expired buckets can only appear when the clock enters a new time
            # bucket, so purge once per bucket instead of scanning the whole
            # window on every request.
            if bucket_ts != self._last_cleanup_bucket:
                self._last_cleanup_bucket = bucket_ts
                self.cleanup_window_locked(now)

    def cleanup_window_locked(self, now):
        """Drop window buckets older than the window plus one bucket width.

        Must be called with ``self.lock`` held.
        """
        min_ts = now - self.window_seconds - self.bucket_seconds
        expired = [key for key in self.domain_window if key[0] < min_ts]
        for key in expired:
            del self.domain_window[key]

    def calculate_domain_window_stats(self):
        """Derive the per-site gauges for the current window.

        Returns:
            Dict mapping ``(metric_name, (("site", site),))`` to a float.
            Empty when the "domain" module is disabled. Sites without traffic
            in the window are omitted. Which metrics appear depends on the
            profile.
        """
        if not self.module_enabled("domain"):
            return {}
        now = int(time.time())
        min_ts = now - self.window_seconds
        stats = defaultdict(lambda: defaultdict(float))
        with self.lock:
            self.cleanup_window_locked(now)
            # Aggregate while holding the lock: record_http() adds keys to the
            # per-bucket dicts, which must not change size during iteration.
            for (bucket_ts, site), values in self.domain_window.items():
                if bucket_ts < min_ts:
                    continue
                site_totals = stats[site]
                for k, v in values.items():
                    site_totals[k] += float(v)

        out = {}
        window = float(self.window_seconds)
        for site, s in stats.items():
            total = s.get("total", 0.0)
            if total <= 0:
                continue
            labels = (("site", site),)
            hit = s.get("cache_hit", 0.0)
            miss = s.get("cache_miss", 0.0)
            passed = s.get("cache_pass", 0.0)
            pipe = s.get("cache_pipe", 0.0)
            synth = s.get("cache_synth", 0.0)
            unknown_cache = s.get("cache_unknown", 0.0)
            backend = s.get("backend", 0.0)
            cacheable = s.get("cacheable", 0.0)
            e2xx = s.get("2xx", 0.0)
            e3xx = s.get("3xx", 0.0)
            e4xx = s.get("4xx", 0.0)
            e5xx = s.get("5xx", 0.0)
            errors = s.get("errors", 0.0)

            # --- emitted for every profile ---
            out[("varnish_domain_rps", labels)] = total / window
            out[("varnish_domain_hit_ratio", labels)] = hit / total
            out[("varnish_domain_backend_ratio", labels)] = backend / total
            out[("varnish_domain_5xx_ratio", labels)] = e5xx / total

            latency_count = s.get("latency_count", 0.0)
            latency_buckets = None
            if latency_count > 0:
                latency_buckets = {
                    le: s.get(f"latency_le_{le}", 0.0) for le in self.buckets
                }
                latency_buckets[float("inf")] = s.get("latency_le_inf", 0.0)
                out[("varnish_domain_p95_latency_seconds", labels)] = estimate_quantile_from_buckets(
                    latency_buckets,
                    latency_count,
                    0.95,
                )

            if self.emit_standard():
                out[("varnish_domain_requests_per_second", labels)] = total / window
                out[("varnish_domain_hit_rps", labels)] = hit / window
                out[("varnish_domain_miss_rps", labels)] = miss / window
                out[("varnish_domain_pass_rps", labels)] = passed / window
                out[("varnish_domain_backend_rps", labels)] = backend / window
                # Every hit is a request the backend did not have to serve.
                out[("varnish_domain_saved_backend_rps", labels)] = hit / window
                out[("varnish_domain_miss_ratio", labels)] = miss / total
                out[("varnish_domain_pass_ratio", labels)] = passed / total
                out[("varnish_domain_cacheable_ratio", labels)] = cacheable / total
                out[("varnish_domain_4xx_ratio", labels)] = e4xx / total
                out[("varnish_domain_error_ratio", labels)] = errors / total
                if cacheable > 0:
                    out[("varnish_domain_cache_efficiency_ratio", labels)] = hit / cacheable
                else:
                    out[("varnish_domain_cache_efficiency_ratio", labels)] = 0.0
                if latency_count > 0 and latency_buckets is not None:
                    out[("varnish_domain_avg_latency_seconds", labels)] = s.get("latency_sum", 0.0) / latency_count
                    out[("varnish_domain_latency_observed_ratio", labels)] = latency_count / total
                    for suffix, q in (("p50", 0.50), ("p90", 0.90), ("p99", 0.99)):
                        out[(f"varnish_domain_{suffix}_latency_seconds", labels)] = estimate_quantile_from_buckets(
                            latency_buckets,
                            latency_count,
                            q,
                        )
                    out[("varnish_domain_slow_100ms_ratio", labels)] = s.get("slow_100ms", 0.0) / latency_count
                    out[("varnish_domain_slow_250ms_ratio", labels)] = s.get("slow_250ms", 0.0) / latency_count
                    out[("varnish_domain_slow_500ms_ratio", labels)] = s.get("slow_500ms", 0.0) / latency_count
                    out[("varnish_domain_slow_1s_ratio", labels)] = s.get("slow_1s", 0.0) / latency_count

            if self.emit_full():
                out[("varnish_domain_pipe_rps", labels)] = pipe / window
                out[("varnish_domain_synth_rps", labels)] = synth / window
                out[("varnish_domain_unknown_cache_rps", labels)] = unknown_cache / window
                out[("varnish_domain_pipe_ratio", labels)] = pipe / total
                out[("varnish_domain_synth_ratio", labels)] = synth / total
                out[("varnish_domain_unknown_cache_ratio", labels)] = unknown_cache / total
                out[("varnish_domain_2xx_ratio", labels)] = e2xx / total
                out[("varnish_domain_3xx_ratio", labels)] = e3xx / total
                out[("varnish_domain_2xx_rps", labels)] = e2xx / window
                out[("varnish_domain_3xx_rps", labels)] = e3xx / window
                out[("varnish_domain_4xx_rps", labels)] = e4xx / window
                out[("varnish_domain_5xx_rps", labels)] = e5xx / window
                out[("varnish_domain_error_rps", labels)] = errors / window
                if latency_count > 0:
                    out[("varnish_domain_slow_50ms_ratio", labels)] = s.get("slow_50ms", 0.0) / latency_count
                    out[("varnish_domain_slow_2500ms_ratio", labels)] = s.get("slow_2500ms", 0.0) / latency_count
                    out[("varnish_domain_slow_5s_ratio", labels)] = s.get("slow_5s", 0.0) / latency_count
                for cache_name in ("hit", "miss", "pass", "pipe", "synth", "unknown"):
                    cache_latency_count = s.get(f"cache_{cache_name}_latency_count", 0.0)
                    cache_latency_sum = s.get(f"cache_{cache_name}_latency_sum", 0.0)
                    if cache_latency_count > 0:
                        metric = f"varnish_domain_{cache_name}_avg_latency_seconds"
                        out[(metric, labels)] = cache_latency_sum / cache_latency_count
        return out

    def render(self):
        """Return all metrics in the Prometheus text exposition format."""
        # Snapshot under the lock so formatting happens on private copies.
        with self.lock:
            gauges = dict(self.gauges)
            counters = dict(self.counters)
            histograms = {
                k: {
                    "buckets": dict(v["buckets"]),
                    "sum": v["sum"],
                    "count": v["count"],
                }
                for k, v in self.histograms.items()
            }
            series_count = len(self.series_seen)
            dropped_series = self.dropped_series

        # Derived per-site values are rendered as gauges next to stored ones.
        gauges.update(self.calculate_domain_window_stats())

        out = []
        if self.module_enabled("core"):
            out.append("# TYPE varnish_exporter_series gauge")
            out.append(f"varnish_exporter_series {series_count}")
            out.append("# TYPE varnish_exporter_dropped_series_total counter")
            out.append(f"varnish_exporter_dropped_series_total {dropped_series}")
            out.append("# TYPE varnish_exporter_window_seconds gauge")
            out.append(f"varnish_exporter_window_seconds {self.window_seconds}")
            out.append("# TYPE varnish_exporter_profile gauge")
            out.append(f'varnish_exporter_profile{{profile="{self.profile}"}} 1')
            out.append("# TYPE varnish_exporter_module_enabled gauge")
            for module in sorted(self.modules):
                out.append(f'varnish_exporter_module_enabled{{module="{module}"}} 1')

        # Names that already received a TYPE line; sorting keeps all series
        # of one metric adjacent so a single TYPE line precedes them.
        typed = set()
        for (name, labels), value in sorted(gauges.items()):
            if name not in typed:
                out.append(f"# TYPE {name} gauge")
                typed.add(name)
            out.append(format_metric(name, labels, value))
        for (name, labels), value in sorted(counters.items()):
            if name not in typed:
                out.append(f"# TYPE {name} counter")
                typed.add(name)
            out.append(format_metric(name, labels, value))
        for (name, labels), h in sorted(histograms.items()):
            if name not in typed:
                out.append(f"# TYPE {name} histogram")
                typed.add(name)
            base_labels = dict(labels)
            for b in self.buckets:
                lb = dict(base_labels)
                lb["le"] = str(b)
                out.append(format_metric(name + "_bucket", tuple(sorted(lb.items())), h["buckets"].get(b, 0.0)))
            lb = dict(base_labels)
            lb["le"] = "+Inf"
            out.append(format_metric(name + "_bucket", tuple(sorted(lb.items())), h["buckets"].get(float("inf"), 0.0)))
            out.append(format_metric(name + "_sum", labels, h["sum"]))
            out.append(format_metric(name + "_count", labels, h["count"]))
        return "\n".join(out) + "\n"
class VarnishStatCollector(threading.Thread):
    """Daemon thread that periodically imports ``varnishstat -1 -j`` output."""

    # varnishstat sections whose middle key part names an object; the value
    # is the label name under which that object is exported.
    _LABELED_SECTIONS = {
        "VBE": "backend",
        "SMA": "storage",
        "SMF": "storage",
        "MSE": "storage",
        "LCK": "lock",
    }

    def __init__(self, metrics, interval, instance):
        """
        Args:
            metrics: Store receiving the values (set_gauge / set_counter).
            interval: Seconds to sleep between two collections.
            instance: Value passed to ``varnishstat -n``; empty to omit it.
        """
        super().__init__(daemon=True)
        self.metrics = metrics
        self.interval = interval
        self.instance = instance

    def run(self):
        """Collect forever, publishing collector health and duration gauges."""
        while True:
            started = time.time()
            try:
                self.collect()
            except Exception:
                # Best effort: a failed run is reported through the "up"
                # gauge and retried after the normal interval.
                up = 0
            else:
                up = 1
            self.metrics.set_gauge("varnish_exporter_collector_up", up, collector="varnishstat")
            self.metrics.set_gauge(
                "varnish_exporter_collector_duration_seconds",
                time.time() - started,
                collector="varnishstat",
            )
            time.sleep(self.interval)

    def collect(self):
        """Run varnishstat once and store every counter it reports.

        Raises:
            subprocess.SubprocessError, OSError: varnishstat failed, timed
                out (10 s) or is not installed.
            json.JSONDecodeError: The output is not valid JSON.
        """
        cmd = ["varnishstat", "-1", "-j"]
        if self.instance:
            cmd.extend(["-n", self.instance])
        raw = subprocess.check_output(cmd, text=True, timeout=10)
        data = json.loads(raw)
        for key, item in self._counter_items(data).items():
            # Skips non-counter entries such as "timestamp" or "version".
            if not isinstance(item, dict) or "value" not in item:
                continue
            value = item.get("value", 0)
            flag = item.get("flag", "g")
            metric, labels = self.metric_from_key(key)
            if flag == "c":
                self.metrics.set_counter(metric, value, **labels)
            else:
                self.metrics.set_gauge(metric, value, **labels)

    @staticmethod
    def _counter_items(data):
        """Return the counter mapping for either varnishstat JSON layout.

        Newer Varnish releases wrap the counters in a top-level "counters"
        object; older ones put them directly at the root.
        """
        nested = data.get("counters") if isinstance(data, dict) else None
        return nested if isinstance(nested, dict) else data

    def metric_from_key(self, key):
        """Translate a varnishstat key into ``(metric_name, labels)``.

        Two-part keys ("MAIN.cache_hit") map to ``varnish_<section>_<field>``.
        For the labeled sections the middle parts become a label value
        ("VBE.boot.default.happy" -> ``varnish_vbe_happy{backend="boot.default"}``).
        Any other key has its remaining parts joined into the metric name.
        """
        parts = key.split(".")
        section = prom_name(parts[0])
        if len(parts) == 2:
            return f"varnish_{section}_{prom_name(parts[1])}", {}
        if parts[0] in self._LABELED_SECTIONS and len(parts) >= 3:
            label_name = self._LABELED_SECTIONS[parts[0]]
            object_name = ".".join(parts[1:-1])
            field = parts[-1]
            return f"varnish_{section}_{prom_name(field)}", {label_name: object_name}
        return f"varnish_{section}_{prom_name('_'.join(parts[1:]))}", {}
class VarnishLogCollector(threading.Thread):
    """Daemon thread that samples ``varnishlog -g request`` transactions."""

    # "-   Tag   value" lines of the top-level request (single dash only).
    _LINE_RE = re.compile(r"^\s*-\s+([A-Za-z0-9_]+)(?:\s+(.*))?$")
    # "Name: <absolute time> <second number> ..."; the second number is used
    # as the latency.
    _TIMESTAMP_RE = re.compile(r"^([A-Za-z_]+):\s+\d+\.\d+\s+([0-9.]+)")
    _CACHE_STATES = frozenset({"HIT", "MISS", "PASS", "PIPE", "SYNTH"})

    def __init__(self, metrics, cfg, instance, sample_rate):
        """
        Args:
            metrics: Store receiving the samples (record_http / set_gauge).
            cfg: Configuration dict as returned by load_config().
            instance: Value passed to ``varnishlog -n``; empty to omit it.
            sample_rate: Probability (0..1] of keeping a transaction. Each
                kept transaction is weighted by ``1 / sample_rate``.
        """
        super().__init__(daemon=True)
        self.metrics = metrics
        self.cfg = cfg
        self.instance = instance
        self.sample_rate = sample_rate
        self.sample_weight = 1.0 / sample_rate if sample_rate > 0 else 0.0

    def run(self):
        """Keep a varnishlog process running, restarting it after 2 seconds."""
        while True:
            try:
                self.stream()
            except Exception:
                # Best effort: any failure is handled by marking the
                # collector down and restarting below.
                pass
            # stream() only returns or raises once varnishlog is gone, so the
            # collector is down in both cases (including a clean exit).
            self.metrics.set_gauge("varnish_exporter_collector_up", 0, collector="varnishlog")
            time.sleep(2)

    def stream(self):
        """Run varnishlog and feed its output into the metrics store.

        Returns when varnishlog closes its stdout. The child process is
        always terminated and reaped before returning.
        """
        cmd = [
            "varnishlog",
            "-g", "request",
            "-i", "ReqMethod,ReqHeader,RespStatus,VCL_call,Timestamp,End",
        ]
        if self.instance:
            cmd.extend(["-n", self.instance])
        proc = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            text=True,
            bufsize=1,
        )
        try:
            self.metrics.set_gauge("varnish_exporter_collector_up", 1, collector="varnishlog")
            self._consume(proc.stdout)
        finally:
            if proc.poll() is None:
                proc.terminate()
            try:
                proc.wait(timeout=5)
            except subprocess.TimeoutExpired:
                proc.kill()
                proc.wait()
            proc.stdout.close()

    def _consume(self, lines):
        """Group log lines into transactions and finish each sampled one."""
        tx = None
        sampled = False
        for line in lines:
            line = line.rstrip("\n")
            if "<< Request" in line:
                # A new transaction header also closes a still-open one.
                if tx:
                    self.finish_tx(tx)
                sampled = random.random() < self.sample_rate
                tx = {} if sampled else None
                continue
            if not sampled or tx is None:
                continue
            parsed = self.parse_line(line)
            if not parsed:
                continue
            tag, value = parsed
            if tag == "End":
                self.finish_tx(tx)
                tx = None
                sampled = False
                continue
            if tag == "ReqMethod":
                tx["method"] = value.split()[0] if value else "UNKNOWN"
            elif tag == "RespStatus":
                tx["status"] = value.split()[0] if value else "0"
            elif tag == "ReqHeader":
                if value.lower().startswith("host:"):
                    tx["host"] = value.split(":", 1)[1].strip()
            elif tag == "VCL_call":
                cache = self.cache_state(value)
                if cache:
                    tx["cache"] = cache
            elif tag == "Timestamp":
                name, latency = self.parse_timestamp(value)
                if latency is not None:
                    if name == "Resp":
                        tx["response_time"] = latency
                    elif name == "Fetch":
                        tx["backend_time"] = latency
                    else:
                        # Largest other timestamp, used when "Resp" is missing.
                        tx["fallback_time"] = max(tx.get("fallback_time", 0.0), latency)
        if tx:
            self.finish_tx(tx)

    def parse_line(self, line):
        """Split a top-level record line into ``(tag, value)``; None if no match."""
        m = self._LINE_RE.match(line)
        if not m:
            return None
        return m.group(1), (m.group(2) or "").strip()

    def cache_state(self, value):
        """Return the lower-cased cache outcome for a VCL_call value, else None."""
        value = value.upper().strip()
        if value in self._CACHE_STATES:
            return value.lower()
        return None

    def parse_timestamp(self, value):
        """Parse a Timestamp record into ``(name, seconds)``.

        Returns ``(None, None)`` when the record does not match or the number
        cannot be converted.
        """
        m = self._TIMESTAMP_RE.match(value)
        if not m:
            return None, None
        try:
            return m.group(1), float(m.group(2))
        except ValueError:
            return None, None

    def finish_tx(self, tx):
        """Normalize a collected transaction and record it; empty ones are ignored."""
        if not tx:
            return
        method = normalize_method(tx.get("method", "UNKNOWN"), self.cfg)
        status = status_class(tx.get("status", "0"))
        cache = tx.get("cache", "unknown")
        site = site_group(tx.get("host", ""), self.cfg)
        latency = tx.get("response_time", tx.get("fallback_time"))
        self.metrics.record_http(
            site=site,
            method=method,
            status_class_value=status,
            cache=cache,
            latency=latency,
            weight=self.sample_weight,
        )
class Handler(BaseHTTPRequestHandler):
    """HTTP handler that serves the metrics store on ``/metrics``."""

    # Shared metrics store; assigned on the class before the server starts.
    metrics = None

    def do_GET(self):
        """Answer ``/metrics`` with the rendered metrics; anything else gets 404."""
        if self.path == "/metrics":
            t0 = time.time()
            text = self.metrics.render()
            elapsed = time.time() - t0
            # Stored after rendering, so it shows up in the next scrape.
            self.metrics.set_gauge("varnish_exporter_render_duration_seconds", elapsed)
            payload = text.encode("utf-8")
            self.send_response(200)
            headers = (
                ("Content-Type", "text/plain; version=0.0.4; charset=utf-8"),
                ("Content-Length", str(len(payload))),
            )
            for header, content in headers:
                self.send_header(header, content)
            self.end_headers()
            self.wfile.write(payload)
        else:
            self.send_response(404)
            self.end_headers()

    def log_message(self, fmt, *args):
        """Discard the per-request access log."""
        return None
def build_parser():
    """Build the command-line parser for the exporter.

    Returns:
        An argparse.ArgumentParser whose epilog (printed verbatim by the raw
        formatter) lists usage examples, modules and profiles.
    """
    usage_notes = """
Examples:
Only varnishstat:
python3 varnish_exporter.py --modules core,stat
Test VSL without sampling:
sudo python3 varnish_exporter.py --enable-vsl --vsl-sample 1 --profile full
Recommended production mode:
sudo python3 varnish_exporter.py --enable-vsl --vsl-sample 0.001 --profile standard
Debug with raw metrics:
sudo python3 varnish_exporter.py --modules core,stat,vsl,domain,raw --profile raw --vsl-sample 1
With domain config:
sudo python3 varnish_exporter.py --config /etc/varnish-exporter/config.json --enable-vsl
Modules:
core exporter self metrics
stat varnishstat -1 -j
vsl varnishlog -g request
domain derived per-domain statistics
raw raw request counters/histograms
Profiles:
minimal small set: rps, hit ratio, backend ratio, 5xx, p95
standard recommended: business-oriented domain statistics
full more detailed statistics
raw full + raw HTTP metrics
"""
    parser = argparse.ArgumentParser(
        description="Varnish Prometheus Business Exporter - per-domain statistics from varnishstat and varnishlog.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_notes,
    )
    # (flag, add_argument keyword arguments), in the order shown by --help.
    option_specs = (
        ("--listen", dict(default="0.0.0.0", help="HTTP listen address. Default: 0.0.0.0")),
        ("--port", dict(type=int, default=9131, help="HTTP /metrics port. Default: 9131")),
        ("--instance", dict(default="", help="Varnish instance for -n. Usually empty.")),
        ("--config", dict(default="", help="Path to config.json with domain/site rules.")),
        ("--stat-interval", dict(type=int, default=5, help="varnishstat interval in seconds. Default: 5")),
        ("--modules", dict(
            type=parse_modules,
            default=parse_modules("core,stat,vsl,domain"),
            help="Modules to enable: core,stat,vsl,domain,raw or all. Default: core,stat,vsl,domain",
        )),
        ("--profile", dict(
            choices=["minimal", "standard", "full", "raw"],
            default="standard",
            help="Domain metric detail level. Default: standard",
        )),
        ("--enable-vsl", dict(action="store_true", help="Enable varnishlog/VSL collector.")),
        ("--enable-varnishlog", dict(action="store_true", help="Alias for --enable-vsl.")),
        ("--vsl-sample", dict(type=float, default=0.001, help="VSL sampling: 1=100%%, 0.001=0.1%%. Default: 0.001")),
        ("--max-series", dict(type=int, default=10000, help="Maximum number of series in the exporter. Default: 10000")),
        ("--window-seconds", dict(type=int, default=60, help="Window for domain statistics. Default: 60")),
        ("--bucket-seconds", dict(type=int, default=5, help="Internal bucket size for the domain window. Default: 5")),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    return parser
def main():
    """Parse arguments, start the enabled collectors and serve /metrics.

    Blocks until interrupted with Ctrl-C.

    Raises:
        SystemExit: An argument value is out of range.
    """
    parser = build_parser()
    args = parser.parse_args()

    modules = set(args.modules)
    if args.enable_vsl or args.enable_varnishlog:
        modules.add("vsl")
    # The raw module only makes sense together with the raw profile.
    if "raw" in modules and args.profile != "raw":
        args.profile = "raw"

    if args.vsl_sample <= 0 or args.vsl_sample > 1:
        raise SystemExit("--vsl-sample must be in range 0 < x <= 1")
    if args.window_seconds < 10:
        raise SystemExit("--window-seconds must be >= 10")
    if args.bucket_seconds < 1:
        raise SystemExit("--bucket-seconds must be >= 1")
    # A bucket wider than the window would be filtered out as "too old" while
    # it is still being filled, making the domain stats disappear.
    if args.bucket_seconds > args.window_seconds:
        raise SystemExit("--bucket-seconds must be <= --window-seconds")
    # 0 would spawn varnishstat in a busy loop; a negative value makes
    # time.sleep() raise and kills the collector thread.
    if args.stat_interval < 1:
        raise SystemExit("--stat-interval must be >= 1")

    cfg = load_config(args.config)
    metrics = Metrics(
        buckets=cfg["histogram_buckets"],
        max_series=args.max_series,
        window_seconds=args.window_seconds,
        bucket_seconds=args.bucket_seconds,
        profile=args.profile,
        modules=modules,
    )

    if "stat" in modules:
        VarnishStatCollector(
            metrics=metrics,
            interval=args.stat_interval,
            instance=args.instance,
        ).start()
    else:
        metrics.set_gauge("varnish_exporter_collector_up", 0, collector="varnishstat")

    if "vsl" in modules:
        VarnishLogCollector(
            metrics=metrics,
            cfg=cfg,
            instance=args.instance,
            sample_rate=args.vsl_sample,
        ).start()
    else:
        metrics.set_gauge("varnish_exporter_collector_up", 0, collector="varnishlog")

    Handler.metrics = metrics
    server = ThreadingHTTPServer((args.listen, args.port), Handler)
    print(f"listening on http://{args.listen}:{args.port}/metrics")
    print(f"profile={args.profile}")
    print(f"modules={','.join(sorted(modules))}")
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop the exporter; exit without a traceback.
        pass
    finally:
        server.server_close()


if __name__ == "__main__":
    main()