1093 lines
34 KiB
Python
Executable File
1093 lines
34 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Varnish Prometheus Business Exporter
|
|
|
|
Purpose:
|
|
A lightweight Prometheus exporter for Varnish focused not only on raw counters,
|
|
but also on ready-to-use business and operational statistics per domain/site.
|
|
|
|
Data sources:
|
|
1. varnishstat -1 -j
|
|
- global Varnish metrics
|
|
- storage, backends, cache hits/misses, workers, locks, etc.
|
|
|
|
2. varnishlog -g request
|
|
- sampled request stream
|
|
- per-domain aggregation
|
|
- hit/miss/pass/backend ratios
|
|
- latency p50/p90/p95/p99
|
|
- error ratios
|
|
- RPS
|
|
- saved backend RPS
|
|
|
|
Main domain metrics:
|
|
varnish_domain_rps
|
|
varnish_domain_hit_ratio
|
|
varnish_domain_miss_ratio
|
|
varnish_domain_pass_ratio
|
|
varnish_domain_backend_ratio
|
|
varnish_domain_cache_efficiency_ratio
|
|
varnish_domain_saved_backend_rps
|
|
varnish_domain_error_ratio
|
|
varnish_domain_4xx_ratio
|
|
varnish_domain_5xx_ratio
|
|
varnish_domain_avg_latency_seconds
|
|
varnish_domain_p50_latency_seconds
|
|
varnish_domain_p90_latency_seconds
|
|
varnish_domain_p95_latency_seconds
|
|
varnish_domain_p99_latency_seconds
|
|
varnish_domain_slow_100ms_ratio
|
|
varnish_domain_slow_250ms_ratio
|
|
varnish_domain_slow_500ms_ratio
|
|
varnish_domain_slow_1s_ratio
|
|
|
|
Profiles:
|
|
minimal:
|
|
- exporter health
|
|
- varnishstat
|
|
- per-domain: rps, hit_ratio, backend_ratio, 5xx_ratio, p95
|
|
|
|
standard:
|
|
- minimal +
|
|
- miss/pass ratio
|
|
- cache efficiency
|
|
- backend rps
|
|
- saved backend rps
|
|
- error ratio
|
|
- avg/p50/p90/p99 latency
|
|
- slow ratios
|
|
|
|
full:
|
|
- standard +
|
|
- per-cache average latency
|
|
- 2xx/3xx/4xx/5xx rps/ratio
|
|
- pipe/synth/unknown ratios
|
|
|
|
raw:
|
|
- full +
|
|
- raw varnish_http_requests_total
|
|
- raw varnish_http_request_duration_seconds
|
|
- varnish_domain_response_time_seconds histogram
|
|
|
|
Modules:
|
|
core:
|
|
- exporter self metrics
|
|
|
|
stat:
|
|
- varnishstat metrics
|
|
|
|
vsl:
|
|
- varnishlog collector
|
|
|
|
domain:
|
|
- derived per-domain metrics
|
|
|
|
raw:
|
|
- raw HTTP request counters/histograms
|
|
|
|
Defaults:
|
|
--modules core,stat,vsl,domain
|
|
--profile standard
|
|
--vsl-sample 0.001
|
|
|
|
Examples:
|
|
|
|
Local test without sampling:
|
|
sudo python3 varnish_exporter.py --enable-vsl --vsl-sample 1 --profile full
|
|
|
|
Production:
|
|
sudo python3 varnish_exporter.py --enable-vsl --vsl-sample 0.001 --profile standard
|
|
|
|
Only varnishstat:
|
|
python3 varnish_exporter.py --modules core,stat
|
|
|
|
Only domain aggregates, no raw HTTP metrics:
|
|
sudo python3 varnish_exporter.py --modules core,stat,vsl,domain --profile standard
|
|
|
|
Debug with raw metrics:
|
|
sudo python3 varnish_exporter.py --modules core,stat,vsl,domain,raw --profile raw --vsl-sample 1
|
|
|
|
JSON config example:
|
|
{
|
|
"site_rules": [
|
|
{
|
|
"match": "(^|\\\\.)example\\\\.com$",
|
|
"site": "example_com"
|
|
},
|
|
{
|
|
"match": "(^|\\\\.)static\\\\.example\\\\.com$",
|
|
"site": "static_example_com"
|
|
}
|
|
],
|
|
"default_site": "other",
|
|
"allowed_methods": ["GET", "HEAD", "POST", "PUT", "PATCH", "DELETE", "OPTIONS"],
|
|
"histogram_buckets": [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10]
|
|
}
|
|
|
|
Notes:
|
|
- Varnish cannot see real browser render time.
|
|
- Latency here means server-side response time as seen by Varnish.
|
|
- For very high traffic, do not use --vsl-sample 1 in production.
|
|
- Sensible production values: 0.001 or 0.0001.
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import random
|
|
import re
|
|
import subprocess
|
|
import threading
|
|
import time
|
|
from collections import defaultdict
|
|
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
|
|
|
|
|
|
# Built-in configuration. load_config() starts from this dict when no config
# path is given and also reads its "allowed_methods" and "histogram_buckets"
# entries as fallbacks for keys missing from a user-supplied file.
DEFAULT_CONFIG = {
    "site_rules": [],
    "default_site": "other",
    "allowed_methods": ["GET", "HEAD", "POST", "PUT", "PATCH", "DELETE", "OPTIONS"],
    "histogram_buckets": [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
}


# Module names accepted by parse_modules() (the --modules option).
VALID_MODULES = {"core", "stat", "vsl", "domain", "raw"}
|
|
|
|
|
|
def load_config(path):
    """Load the exporter configuration and derive the runtime lookup structures.

    Args:
        path: Path to a JSON config file. A falsy value selects a copy of
            ``DEFAULT_CONFIG``.

    Returns:
        A dict containing ``site_rules``, ``default_site``, ``allowed_methods``
        and ``histogram_buckets`` (missing keys are filled with defaults) plus
        two derived entries:

        * ``site_rules_compiled``: list of ``(compiled_regex, site)`` tuples,
          compiled case-insensitively in rule order.
        * ``allowed_methods_set``: set of upper-cased method names.

    Raises:
        OSError: The file cannot be opened.
        ValueError: The file is not valid JSON or its root is not an object.
        KeyError: A site rule lacks ``match`` or ``site``.
        re.error: A site rule pattern is not a valid regular expression.
    """
    if path:
        with open(path, "r", encoding="utf-8") as f:
            cfg = json.load(f)

        if not isinstance(cfg, dict):
            raise ValueError(f"config root must be a JSON object: {path}")
    else:
        # Copy list values so callers can never mutate the module-level defaults.
        cfg = {
            key: list(value) if isinstance(value, list) else value
            for key, value in DEFAULT_CONFIG.items()
        }

    cfg.setdefault("site_rules", [])
    cfg.setdefault("default_site", "other")
    cfg.setdefault("allowed_methods", list(DEFAULT_CONFIG["allowed_methods"]))
    cfg.setdefault("histogram_buckets", list(DEFAULT_CONFIG["histogram_buckets"]))

    cfg["site_rules_compiled"] = [
        (re.compile(rule["match"], re.I), rule["site"])
        for rule in cfg["site_rules"]
    ]

    # normalize_method() upper-cases the request method before the lookup, so
    # the set must be upper-case too or lower-case config entries never match.
    cfg["allowed_methods_set"] = {str(m).upper() for m in cfg["allowed_methods"]}
    return cfg
|
|
|
|
|
|
def parse_modules(value):
    """Turn a comma-separated module list into a set of module names.

    Entries are trimmed and lower-cased; empty entries are ignored. The
    special entry ``all`` immediately selects every name in ``VALID_MODULES``.

    Raises:
        argparse.ArgumentTypeError: An entry is not a known module, or the
            list contains no module at all.
    """
    selected = set()

    for token in (part.strip().lower() for part in value.split(",")):
        if token == "":
            continue

        if token == "all":
            return set(VALID_MODULES)

        if token in VALID_MODULES:
            selected.add(token)
            continue

        raise argparse.ArgumentTypeError(
            f"unknown module: {token}; available modules: {','.join(sorted(VALID_MODULES))}"
        )

    if selected:
        return selected

    raise argparse.ArgumentTypeError("module list cannot be empty")
|
|
|
|
|
|
def prom_escape(value):
    """Return ``value`` as text with backslash, double quote and newline escaped.

    The result is suitable for use inside a quoted Prometheus label value.
    """
    replacements = str.maketrans({
        "\\": "\\\\",
        '"': '\\"',
        "\n": "\\n",
    })
    return str(value).translate(replacements)
|
|
|
|
|
|
def prom_name(name):
    """Convert an arbitrary value into a lower-case metric-name fragment.

    Every run of characters outside ``a-z0-9`` becomes a single underscore,
    leading/trailing underscores are removed, and a leading digit is prefixed
    with ``_``. An empty result is reported as ``"unknown"``.
    """
    lowered = str(name).lower()
    # Underscores are part of the run on purpose: this collapses repeats too.
    cleaned = re.sub(r"[^a-z0-9]+", "_", lowered).strip("_")

    if not cleaned:
        return "unknown"

    if cleaned[0].isdigit():
        return "_" + cleaned

    return cleaned
|
|
|
|
|
|
def status_class(status):
    """Map an HTTP status to its class label (``"2xx"``, ``"5xx"``, ...).

    Anything that cannot be converted with ``int()`` or lies outside
    100..599 yields ``"unknown"``.
    """
    try:
        code = int(status)
    except Exception:
        return "unknown"

    in_range = 100 <= code <= 599
    return f"{code // 100}xx" if in_range else "unknown"
|
|
|
|
|
|
def normalize_method(method, cfg):
    """Return the upper-cased method if it is allowed, otherwise ``"OTHER"``.

    A falsy ``method`` is treated as ``"UNKNOWN"`` before the lookup in
    ``cfg["allowed_methods_set"]``.
    """
    candidate = str(method or "UNKNOWN").upper()

    if candidate in cfg["allowed_methods_set"]:
        return candidate

    return "OTHER"
|
|
|
|
|
|
def normalize_host(host):
    """Normalize a Host header value to a bare lower-case host name.

    Surrounding whitespace, an optional ``:port`` suffix and a trailing root
    dot are removed. A bracketed IPv6 literal (``[::1]:8080``) is reduced to
    the address between the brackets.

    Args:
        host: Raw header value; ``None`` and other falsy values are accepted.

    Returns:
        The normalized host, or ``""`` when nothing usable remains.
    """
    host = str(host or "").strip().lower()

    if not host:
        return ""

    if host.startswith("["):
        closing = host.find("]")
        if closing != -1:
            return host[1:closing]

    # Exactly one colon means "name:port". Several colons without brackets are
    # an IPv6 literal; cutting at the first colon would leave a useless
    # fragment (for "::1" an empty string), so such values are kept whole.
    if host.count(":") == 1:
        host = host.split(":", 1)[0]

    # Strip the root dot only after the port is gone, otherwise
    # "example.com.:8080" would keep its trailing dot.
    return host.rstrip(".")
|
|
|
|
|
|
def site_group(host, cfg):
    """Resolve a Host header to the site label used for aggregation.

    The host is normalized first. An empty host maps to the configured
    ``default_site`` (``"other"`` if unset); otherwise the site of the first
    matching rule in ``cfg["site_rules_compiled"]`` is returned, and a host
    that matches no rule is returned unchanged.
    """
    normalized = normalize_host(host)

    if normalized == "":
        return cfg.get("default_site", "other")

    matching_sites = (
        site
        for pattern, site in cfg["site_rules_compiled"]
        if pattern.search(normalized)
    )
    return next(matching_sites, normalized)
|
|
|
|
|
|
def format_metric(name, labels, value):
    """Render one sample line in the Prometheus text format.

    ``labels`` is an iterable of ``(key, value)`` pairs; the pairs are sorted
    and the values escaped. With no labels the line is just ``name value``.
    """
    if not labels:
        return f"{name} {value}"

    rendered_pairs = [
        f'{key}="{prom_escape(val)}"'
        for key, val in sorted(labels)
    ]
    joined = ",".join(rendered_pairs)
    return f"{name}{{{joined}}} {value}"
|
|
|
|
|
|
def estimate_quantile_from_buckets(buckets, count, q):
    """Estimate a quantile from cumulative histogram buckets.

    Args:
        buckets: Mapping of upper bound -> cumulative count. The ``inf``
            bound is ignored.
        count: Total number of observations.
        q: Quantile in the range 0..1.

    Returns:
        A value linearly interpolated inside the first finite bucket whose
        cumulative count reaches ``count * q``. Returns ``0.0`` when there
        are no observations or no finite bounds, and the largest finite
        bound when no finite bucket reaches the target.
    """
    if count <= 0:
        return 0.0

    infinity = float("inf")
    bounds = [bound for bound in buckets if bound != infinity]
    bounds.sort()

    if not bounds:
        return 0.0

    rank = count * q
    lower_bound, lower_count = 0.0, 0.0

    for upper_bound in bounds:
        upper_count = buckets.get(upper_bound, 0.0)

        if upper_count >= rank:
            # No growth inside this bucket: nothing to interpolate over.
            if upper_count <= lower_count:
                return upper_bound

            share = (rank - lower_count) / (upper_count - lower_count)
            return lower_bound + ((upper_bound - lower_bound) * share)

        lower_bound, lower_count = upper_bound, upper_count

    return bounds[-1]
|
|
|
|
|
|
class Metrics:
    """Thread-safe metric store with a sliding per-site aggregation window.

    Gauges, counters and histograms are kept in dicts keyed by
    ``(name, labels)`` where ``labels`` is a sorted tuple of ``(key, value)``
    string pairs. The number of distinct stored series is capped by
    ``max_series``; attempts beyond the cap are counted in ``dropped_series``.

    Sampled requests are also accumulated in ``domain_window``, keyed by
    ``(bucket_start_timestamp, site)``; derived per-site gauges are computed
    from it at render time.

    NOTE(review): the derived per-site gauges are merged into the output in
    ``render()`` without passing through ``_allow_series``, so ``max_series``
    does not bound the number of distinct sites inside the window.
    """

    # (window key, latency threshold in seconds) for the "slow request" tallies.
    _SLOW_THRESHOLDS = (
        ("slow_50ms", 0.05),
        ("slow_100ms", 0.1),
        ("slow_250ms", 0.25),
        ("slow_500ms", 0.5),
        ("slow_1s", 1.0),
        ("slow_2500ms", 2.5),
        ("slow_5s", 5.0),
    )
    _TRACKED_STATUS_CLASSES = ("2xx", "3xx", "4xx", "5xx")
    _ERROR_STATUS_CLASSES = ("4xx", "5xx")
    _CACHE_STATES = ("hit", "miss", "pass", "pipe", "synth", "unknown")

    def __init__(self, buckets, max_series, window_seconds, bucket_seconds, profile, modules):
        """Create an empty store.

        Args:
            buckets: Histogram upper bounds in seconds.
            max_series: Maximum number of distinct stored series.
            window_seconds: Length of the sliding window for derived metrics.
            bucket_seconds: Width of one internal window bucket.
            profile: One of ``minimal``, ``standard``, ``full``, ``raw``.
            modules: Collection of enabled module names.
        """
        self.lock = threading.RLock()
        self.buckets = buckets
        self.max_series = max_series
        self.window_seconds = window_seconds
        self.bucket_seconds = bucket_seconds
        self.profile = profile
        self.modules = modules

        self.gauges = {}
        self.counters = {}
        self.histograms = defaultdict(lambda: {
            "buckets": defaultdict(float),
            "sum": 0.0,
            "count": 0.0,
        })

        self.series_seen = set()
        self.dropped_series = 0

        self.domain_window = defaultdict(lambda: defaultdict(float))
        # Start of the bucket during which record_http() last pruned the window.
        self._last_cleanup_bucket = None

    def module_enabled(self, name):
        """Return True if module ``name`` is enabled."""
        return name in self.modules

    def emit_raw(self):
        """Return True if raw per-request metrics should be recorded."""
        return self.profile == "raw" or self.module_enabled("raw")

    def emit_minimal(self):
        """Return True for every known profile."""
        return self.profile in ("minimal", "standard", "full", "raw")

    def emit_standard(self):
        """Return True for the ``standard`` profile and above."""
        return self.profile in ("standard", "full", "raw")

    def emit_full(self):
        """Return True for the ``full`` and ``raw`` profiles."""
        return self.profile in ("full", "raw")

    def _label_key(self, labels):
        """Convert a label dict into a sorted, hashable tuple of string pairs."""
        return tuple(sorted((str(k), str(v)) for k, v in labels.items()))

    def _allow_series(self, metric, labels):
        """Register a series and report whether it may be stored.

        Must be called with ``self.lock`` held. A series that was already
        registered is always allowed; a new one is rejected (and counted in
        ``dropped_series``) once ``max_series`` series exist.
        """
        key = (metric, labels)

        if key in self.series_seen:
            return True

        if len(self.series_seen) >= self.max_series:
            self.dropped_series += 1
            return False

        self.series_seen.add(key)
        return True

    def set_gauge(self, name, value, **labels):
        """Set gauge ``name`` with the given labels to ``float(value)``."""
        labels_key = self._label_key(labels)

        with self.lock:
            if not self._allow_series(name, labels_key):
                return
            self.gauges[(name, labels_key)] = float(value)

    def set_counter(self, name, value, **labels):
        """Set a counter to an absolute value; ``_total`` is appended if missing."""
        if not name.endswith("_total"):
            name += "_total"

        labels_key = self._label_key(labels)

        with self.lock:
            if not self._allow_series(name, labels_key):
                return
            self.counters[(name, labels_key)] = float(value)

    def inc_counter(self, name, amount=1.0, **labels):
        """Add ``amount`` to a counter; ``_total`` is appended if missing."""
        if not name.endswith("_total"):
            name += "_total"

        labels_key = self._label_key(labels)

        with self.lock:
            if not self._allow_series(name, labels_key):
                return
            key = (name, labels_key)
            self.counters[key] = self.counters.get(key, 0.0) + float(amount)

    def observe(self, name, value, weight=1.0, **labels):
        """Record ``value`` in histogram ``name``, counted ``weight`` times.

        Buckets are cumulative: every bound ``>= value`` is incremented, and
        the ``inf`` bucket always is.
        """
        labels_key = self._label_key(labels)
        weight = float(weight)

        with self.lock:
            if not self._allow_series(name, labels_key):
                return

            h = self.histograms[(name, labels_key)]
            h["sum"] += float(value) * weight
            h["count"] += weight

            for bound in self.buckets:
                if value <= bound:
                    h["buckets"][bound] += weight

            h["buckets"][float("inf")] += weight

    def record_http(self, site, method, status_class_value, cache, latency, weight):
        """Account for one sampled request.

        Args:
            site: Site label the request belongs to.
            method: Normalized HTTP method.
            status_class_value: Status class such as ``"2xx"`` or ``"unknown"``.
            cache: Cache outcome (``hit``, ``miss``, ``pass``, ...).
            latency: Response time in seconds, or ``None`` if not observed.
            weight: Number of real requests this sample stands for.
        """
        raw_labels = {
            "site": site,
            "method": method,
            "status_class": status_class_value,
            "cache": cache,
        }

        if self.emit_raw():
            self.inc_counter("varnish_http_requests_total", weight, **raw_labels)

            if latency is not None:
                self.observe(
                    "varnish_http_request_duration_seconds",
                    latency,
                    weight=weight,
                    **raw_labels,
                )
                self.observe(
                    "varnish_domain_response_time_seconds",
                    latency,
                    weight=weight,
                    site=site,
                )

        if not self.module_enabled("domain"):
            return

        now = int(time.time())
        bucket_ts = now - (now % self.bucket_seconds)
        key = (bucket_ts, site)

        with self.lock:
            b = self.domain_window[key]

            b["total"] += weight
            b[f"cache_{cache}"] += weight

            if cache in ("hit", "miss"):
                b["cacheable"] += weight

            if cache in ("miss", "pass"):
                b["backend"] += weight

            if status_class_value in self._TRACKED_STATUS_CLASSES:
                b[status_class_value] += weight
                if status_class_value in self._ERROR_STATUS_CLASSES:
                    b["errors"] += weight

            if latency is not None:
                weighted_latency = latency * weight
                b["latency_sum"] += weighted_latency
                b["latency_count"] += weight
                b[f"cache_{cache}_latency_sum"] += weighted_latency
                b[f"cache_{cache}_latency_count"] += weight

                for slow_key, threshold in self._SLOW_THRESHOLDS:
                    if latency > threshold:
                        b[slow_key] += weight

                for le in self.buckets:
                    if latency <= le:
                        b[f"latency_le_{le}"] += weight

                b["latency_le_inf"] += weight

            # Pruning scans every window entry, so do it once per bucket
            # instead of once per request. calculate_domain_window_stats()
            # prunes as well and filters by timestamp, so stale buckets never
            # reach the output.
            if bucket_ts != self._last_cleanup_bucket:
                self._last_cleanup_bucket = bucket_ts
                self.cleanup_window_locked(now)

    def cleanup_window_locked(self, now):
        """Drop window buckets older than the window plus one bucket width.

        Must be called with ``self.lock`` held.
        """
        min_ts = now - self.window_seconds - self.bucket_seconds

        stale = [key for key in self.domain_window if key[0] < min_ts]
        for key in stale:
            del self.domain_window[key]

    def calculate_domain_window_stats(self):
        """Aggregate the sliding window into derived per-site gauges.

        Returns:
            Dict mapping ``(metric_name, labels)`` to a float, where
            ``labels`` is ``(("site", <site>),)``. Empty when the ``domain``
            module is disabled. Sites with no traffic in the window are
            omitted. Which metrics appear depends on the profile.
        """
        if not self.module_enabled("domain"):
            return {}

        now = int(time.time())
        min_ts = now - self.window_seconds

        with self.lock:
            self.cleanup_window_locked(now)
            # Copy the inner dicts too: the collector thread keeps adding keys
            # to them, and iterating a live dict outside the lock can raise
            # "dictionary changed size during iteration".
            items = [(key, dict(values)) for key, values in self.domain_window.items()]

        stats = defaultdict(lambda: defaultdict(float))

        for (bucket_ts, site), values in items:
            if bucket_ts < min_ts:
                continue

            s = stats[site]
            for k, v in values.items():
                s[k] += float(v)

        out = {}
        window = float(self.window_seconds)

        for site, s in stats.items():
            total = s.get("total", 0.0)

            if total <= 0:
                continue

            # str() keeps label tuples mutually sortable in render() even if a
            # config rule supplied a non-string site value.
            labels = (("site", str(site)),)

            hit = s.get("cache_hit", 0.0)
            miss = s.get("cache_miss", 0.0)
            passed = s.get("cache_pass", 0.0)
            pipe = s.get("cache_pipe", 0.0)
            synth = s.get("cache_synth", 0.0)
            unknown_cache = s.get("cache_unknown", 0.0)

            backend = s.get("backend", 0.0)
            cacheable = s.get("cacheable", 0.0)

            e2xx = s.get("2xx", 0.0)
            e3xx = s.get("3xx", 0.0)
            e4xx = s.get("4xx", 0.0)
            e5xx = s.get("5xx", 0.0)
            errors = s.get("errors", 0.0)

            # Emitted for every profile (the "minimal" set).
            out[("varnish_domain_rps", labels)] = total / window
            out[("varnish_domain_hit_ratio", labels)] = hit / total
            out[("varnish_domain_backend_ratio", labels)] = backend / total
            out[("varnish_domain_5xx_ratio", labels)] = e5xx / total

            latency_count = s.get("latency_count", 0.0)
            latency_buckets = None

            if latency_count > 0:
                latency_buckets = {
                    le: s.get(f"latency_le_{le}", 0.0) for le in self.buckets
                }
                latency_buckets[float("inf")] = s.get("latency_le_inf", 0.0)

                out[("varnish_domain_p95_latency_seconds", labels)] = estimate_quantile_from_buckets(
                    latency_buckets,
                    latency_count,
                    0.95,
                )

            if self.emit_standard():
                out[("varnish_domain_requests_per_second", labels)] = total / window
                out[("varnish_domain_hit_rps", labels)] = hit / window
                out[("varnish_domain_miss_rps", labels)] = miss / window
                out[("varnish_domain_pass_rps", labels)] = passed / window
                out[("varnish_domain_backend_rps", labels)] = backend / window
                # Every hit is a request the backend did not have to serve.
                out[("varnish_domain_saved_backend_rps", labels)] = hit / window

                out[("varnish_domain_miss_ratio", labels)] = miss / total
                out[("varnish_domain_pass_ratio", labels)] = passed / total
                out[("varnish_domain_cacheable_ratio", labels)] = cacheable / total
                out[("varnish_domain_4xx_ratio", labels)] = e4xx / total
                out[("varnish_domain_error_ratio", labels)] = errors / total

                out[("varnish_domain_cache_efficiency_ratio", labels)] = (
                    hit / cacheable if cacheable > 0 else 0.0
                )

                if latency_count > 0 and latency_buckets is not None:
                    out[("varnish_domain_avg_latency_seconds", labels)] = (
                        s.get("latency_sum", 0.0) / latency_count
                    )
                    out[("varnish_domain_latency_observed_ratio", labels)] = latency_count / total

                    for suffix, quantile in (("p50", 0.50), ("p90", 0.90), ("p99", 0.99)):
                        metric = f"varnish_domain_{suffix}_latency_seconds"
                        out[(metric, labels)] = estimate_quantile_from_buckets(
                            latency_buckets,
                            latency_count,
                            quantile,
                        )

                    for slow_key in ("slow_100ms", "slow_250ms", "slow_500ms", "slow_1s"):
                        metric = f"varnish_domain_{slow_key}_ratio"
                        out[(metric, labels)] = s.get(slow_key, 0.0) / latency_count

            if self.emit_full():
                out[("varnish_domain_pipe_rps", labels)] = pipe / window
                out[("varnish_domain_synth_rps", labels)] = synth / window
                out[("varnish_domain_unknown_cache_rps", labels)] = unknown_cache / window

                out[("varnish_domain_pipe_ratio", labels)] = pipe / total
                out[("varnish_domain_synth_ratio", labels)] = synth / total
                out[("varnish_domain_unknown_cache_ratio", labels)] = unknown_cache / total

                out[("varnish_domain_2xx_ratio", labels)] = e2xx / total
                out[("varnish_domain_3xx_ratio", labels)] = e3xx / total
                out[("varnish_domain_2xx_rps", labels)] = e2xx / window
                out[("varnish_domain_3xx_rps", labels)] = e3xx / window
                out[("varnish_domain_4xx_rps", labels)] = e4xx / window
                out[("varnish_domain_5xx_rps", labels)] = e5xx / window
                out[("varnish_domain_error_rps", labels)] = errors / window

                if latency_count > 0:
                    for slow_key in ("slow_50ms", "slow_2500ms", "slow_5s"):
                        metric = f"varnish_domain_{slow_key}_ratio"
                        out[(metric, labels)] = s.get(slow_key, 0.0) / latency_count

                for cache_name in self._CACHE_STATES:
                    cache_latency_count = s.get(f"cache_{cache_name}_latency_count", 0.0)
                    cache_latency_sum = s.get(f"cache_{cache_name}_latency_sum", 0.0)

                    if cache_latency_count > 0:
                        metric = f"varnish_domain_{cache_name}_avg_latency_seconds"
                        out[(metric, labels)] = cache_latency_sum / cache_latency_count

        return out

    def render(self):
        """Return all metrics as a Prometheus text-format document.

        Stored state is snapshotted under the lock, derived per-site gauges
        are merged over the stored gauges, and the output is sorted by metric
        name and labels so that samples of one metric stay together.
        """
        with self.lock:
            gauges = dict(self.gauges)
            counters = dict(self.counters)
            histograms = {
                k: {
                    "buckets": dict(v["buckets"]),
                    "sum": v["sum"],
                    "count": v["count"],
                }
                for k, v in self.histograms.items()
            }
            series_count = len(self.series_seen)
            dropped_series = self.dropped_series

        gauges.update(self.calculate_domain_window_stats())

        out = []

        if self.module_enabled("core"):
            out.append("# TYPE varnish_exporter_series gauge")
            out.append(f"varnish_exporter_series {series_count}")

            out.append("# TYPE varnish_exporter_dropped_series_total counter")
            out.append(f"varnish_exporter_dropped_series_total {dropped_series}")

            out.append("# TYPE varnish_exporter_window_seconds gauge")
            out.append(f"varnish_exporter_window_seconds {self.window_seconds}")

            out.append("# TYPE varnish_exporter_profile gauge")
            out.append(f'varnish_exporter_profile{{profile="{self.profile}"}} 1')

            # Without its own TYPE line this metric would be exposed as untyped.
            out.append("# TYPE varnish_exporter_module_enabled gauge")
            for module in sorted(self.modules):
                out.append(f'varnish_exporter_module_enabled{{module="{module}"}} 1')

        typed = set()

        def by_series(entry):
            # Sort on the (name, labels) key only; values are never compared.
            return entry[0]

        for (name, labels), value in sorted(gauges.items(), key=by_series):
            if name not in typed:
                out.append(f"# TYPE {name} gauge")
                typed.add(name)
            out.append(format_metric(name, labels, value))

        for (name, labels), value in sorted(counters.items(), key=by_series):
            if name not in typed:
                out.append(f"# TYPE {name} counter")
                typed.add(name)
            out.append(format_metric(name, labels, value))

        for (name, labels), h in sorted(histograms.items(), key=by_series):
            if name not in typed:
                out.append(f"# TYPE {name} histogram")
                typed.add(name)

            base_labels = dict(labels)
            bucket_bounds = [(str(b), b) for b in self.buckets]
            bucket_bounds.append(("+Inf", float("inf")))

            for le_text, bound in bucket_bounds:
                lb = dict(base_labels)
                lb["le"] = le_text
                out.append(format_metric(
                    name + "_bucket",
                    tuple(sorted(lb.items())),
                    h["buckets"].get(bound, 0.0),
                ))

            out.append(format_metric(name + "_sum", labels, h["sum"]))
            out.append(format_metric(name + "_count", labels, h["count"]))

        return "\n".join(out) + "\n"
|
|
|
|
|
|
class VarnishStatCollector(threading.Thread):
    """Background thread that polls ``varnishstat -1 -j`` and stores the counters."""

    # Counter sections whose middle key components identify an object; the
    # object name is exported as a label with the given name.
    _LABELED_SECTIONS = {
        "VBE": "backend",
        "SMA": "storage",
        "SMF": "storage",
        "MSE": "storage",
        "LCK": "lock",
    }

    def __init__(self, metrics, interval, instance):
        """Create the collector.

        Args:
            metrics: Store providing ``set_gauge`` and ``set_counter``.
            interval: Seconds to sleep between polls.
            instance: Value for ``varnishstat -n``; empty to omit the option.
        """
        super().__init__(daemon=True)
        self.metrics = metrics
        self.interval = interval
        self.instance = instance

    def run(self):
        """Poll forever, publishing collector health and duration after each poll."""
        while True:
            started = time.time()

            try:
                self.collect()
                self.metrics.set_gauge("varnish_exporter_collector_up", 1, collector="varnishstat")
            except Exception:
                # Best effort: any failure (missing binary, timeout, bad JSON)
                # is reported through the health gauge and retried next round.
                self.metrics.set_gauge("varnish_exporter_collector_up", 0, collector="varnishstat")

            self.metrics.set_gauge(
                "varnish_exporter_collector_duration_seconds",
                time.time() - started,
                collector="varnishstat",
            )

            time.sleep(self.interval)

    @staticmethod
    def _extract_counters(data):
        """Return the key -> counter mapping from decoded varnishstat JSON.

        Newer varnishstat versions nest the counters under a top-level
        ``"counters"`` object; older ones put them at the top level next to
        ``"timestamp"``. Both layouts are accepted.
        """
        if not isinstance(data, dict):
            return {}

        nested = data.get("counters")
        if isinstance(nested, dict):
            return nested

        return data

    def collect(self):
        """Run varnishstat once and store every counter it reports.

        Entries flagged ``"c"`` are stored as counters, everything else as
        gauges. Entries that are not dicts with a ``"value"`` are skipped.

        Raises:
            subprocess.SubprocessError, OSError: varnishstat failed, timed
                out (10 s) or could not be started.
            ValueError: The output is not valid JSON.
        """
        cmd = ["varnishstat", "-1", "-j"]

        if self.instance:
            cmd.extend(["-n", self.instance])

        raw = subprocess.check_output(cmd, text=True, timeout=10)
        counters = self._extract_counters(json.loads(raw))

        for key, item in counters.items():
            if not isinstance(item, dict) or "value" not in item:
                continue

            metric, labels = self.metric_from_key(key)

            if item.get("flag", "g") == "c":
                self.metrics.set_counter(metric, item["value"], **labels)
            else:
                self.metrics.set_gauge(metric, item["value"], **labels)

    def metric_from_key(self, key):
        """Translate a dotted varnishstat key into ``(metric_name, labels)``.

        ``SECTION.field`` becomes ``varnish_<section>_<field>``. For the
        sections in ``_LABELED_SECTIONS`` with at least three components, the
        middle components are joined into one label value and the last
        component is the field. Any other key is flattened into the name.
        """
        parts = key.split(".")
        section = prom_name(parts[0])

        if len(parts) == 2:
            return f"varnish_{section}_{prom_name(parts[1])}", {}

        label_name = self._LABELED_SECTIONS.get(parts[0])

        if label_name is not None and len(parts) >= 3:
            object_name = ".".join(parts[1:-1])
            return f"varnish_{section}_{prom_name(parts[-1])}", {label_name: object_name}

        return f"varnish_{section}_{prom_name('_'.join(parts[1:]))}", {}
|
|
|
|
|
|
class VarnishLogCollector(threading.Thread):
    """Background thread that samples client requests from ``varnishlog -g request``."""

    # "-   Tag   value" record line with exactly one leading dash; lines with
    # more dashes do not match and are ignored.
    _RECORD_RE = re.compile(r"^\s*-\s+([A-Za-z0-9_]+)(?:\s+(.*))?$")
    # "Name: <absolute time> <second number> ..." - the second number is captured.
    _TIMESTAMP_RE = re.compile(r"^([A-Za-z_]+):\s+\d+\.\d+\s+([0-9.]+)")
    _CACHE_STATES = frozenset({"HIT", "MISS", "PASS", "PIPE", "SYNTH"})
    _RESTART_DELAY_SECONDS = 2
    _STOP_TIMEOUT_SECONDS = 5

    def __init__(self, metrics, cfg, instance, sample_rate):
        """Create the collector.

        Args:
            metrics: Store providing ``set_gauge`` and ``record_http``.
            cfg: Loaded configuration (site rules, allowed methods).
            instance: Value for ``varnishlog -n``; empty to omit the option.
            sample_rate: Probability (0..1] that a request is recorded. Each
                recorded request is weighted by ``1 / sample_rate``.
        """
        super().__init__(daemon=True)
        self.metrics = metrics
        self.cfg = cfg
        self.instance = instance
        self.sample_rate = sample_rate
        self.sample_weight = 1.0 / sample_rate if sample_rate > 0 else 0.0

    def run(self):
        """Keep a varnishlog stream alive, restarting it after every exit."""
        while True:
            try:
                self.stream()
            except Exception:
                # Best effort: failures are reported through the health gauge
                # below and the stream is restarted.
                pass

            # Reached after a crash AND after a clean end of stream, so the
            # gauge never stays at 1 while no process is running, and
            # restarts are always throttled.
            self.metrics.set_gauge("varnish_exporter_collector_up", 0, collector="varnishlog")
            time.sleep(self._RESTART_DELAY_SECONDS)

    def stream(self):
        """Run varnishlog and feed its output into the metric store until it ends.

        A line containing ``<< Request`` starts a new transaction and decides
        whether it is sampled; records of unsampled transactions are skipped.
        The child process is always stopped and reaped on exit.
        """
        cmd = [
            "varnishlog",
            "-g", "request",
            "-i", "ReqMethod,ReqHeader,RespStatus,VCL_call,Timestamp,End",
        ]

        if self.instance:
            cmd.extend(["-n", self.instance])

        proc = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            text=True,
            bufsize=1,
        )

        self.metrics.set_gauge("varnish_exporter_collector_up", 1, collector="varnishlog")

        tx = None  # dict for the sampled transaction in progress, else None

        try:
            for line in proc.stdout:
                line = line.rstrip("\n")

                if "<< Request" in line:
                    if tx:
                        self.finish_tx(tx)
                    tx = {} if random.random() < self.sample_rate else None
                    continue

                if tx is None:
                    continue

                parsed = self.parse_line(line)
                if not parsed:
                    continue

                tag, value = parsed

                if tag == "End":
                    self.finish_tx(tx)
                    tx = None
                    continue

                self._apply_record(tx, tag, value)

            if tx:
                self.finish_tx(tx)
        finally:
            self._stop_process(proc)

    def _stop_process(self, proc):
        """Terminate the child if still running, reap it and close its pipe."""
        if proc.poll() is None:
            proc.terminate()

        try:
            proc.wait(timeout=self._STOP_TIMEOUT_SECONDS)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait()

        if proc.stdout is not None:
            proc.stdout.close()

    def _apply_record(self, tx, tag, value):
        """Store the information carried by one log record in ``tx``."""
        if tag == "ReqMethod":
            tx["method"] = value.split()[0] if value else "UNKNOWN"

        elif tag == "RespStatus":
            tx["status"] = value.split()[0] if value else "0"

        elif tag == "ReqHeader":
            if value.lower().startswith("host:"):
                tx["host"] = value.split(":", 1)[1].strip()

        elif tag == "VCL_call":
            cache = self.cache_state(value)
            if cache:
                tx["cache"] = cache

        elif tag == "Timestamp":
            name, latency = self.parse_timestamp(value)

            if latency is None:
                return

            if name == "Resp":
                tx["response_time"] = latency
            elif name == "Fetch":
                tx["backend_time"] = latency
            else:
                # Largest other timestamp; used when no "Resp" was seen.
                tx["fallback_time"] = max(tx.get("fallback_time", 0.0), latency)

    def parse_line(self, line):
        """Split a record line into ``(tag, value)``; return None if it is not one."""
        m = self._RECORD_RE.match(line)

        if not m:
            return None

        return m.group(1), (m.group(2) or "").strip()

    def cache_state(self, value):
        """Return the lower-case cache outcome for a VCL_call value, else None."""
        value = value.upper().strip()
        return value.lower() if value in self._CACHE_STATES else None

    def parse_timestamp(self, value):
        """Parse a Timestamp record value.

        Returns:
            ``(name, seconds)`` using the second numeric field, or
            ``(None, None)`` when the value does not have the expected shape.
        """
        m = self._TIMESTAMP_RE.match(value)

        if not m:
            return None, None

        try:
            return m.group(1), float(m.group(2))
        except ValueError:
            return None, None

    def finish_tx(self, tx):
        """Normalize a collected transaction and record it; no-op if it is empty."""
        if not tx:
            return

        self.metrics.record_http(
            site=site_group(tx.get("host", ""), self.cfg),
            method=normalize_method(tx.get("method", "UNKNOWN"), self.cfg),
            status_class_value=status_class(tx.get("status", "0")),
            cache=tx.get("cache", "unknown"),
            latency=tx.get("response_time", tx.get("fallback_time")),
            weight=self.sample_weight,
        )
|
|
|
|
|
|
class Handler(BaseHTTPRequestHandler):
    """HTTP handler that serves the metric store at ``/metrics``."""

    # Shared store; assigned on the class before the server starts.
    metrics = None

    def do_GET(self):
        """Serve ``/metrics`` (a query string is ignored); anything else is 404."""
        route = self.path.split("?", 1)[0]

        if route != "/metrics":
            self.send_response(404)
            self.send_header("Content-Length", "0")
            self.end_headers()
            return

        started = time.time()
        body_text = self.metrics.render()
        duration = time.time() - started

        # Stored after rendering, so the value shows up in the next scrape.
        self.metrics.set_gauge("varnish_exporter_render_duration_seconds", duration)

        body = body_text.encode("utf-8")

        self.send_response(200)
        self.send_header("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, fmt, *args):
        """Suppress the default per-request access log."""
        return
|
|
|
|
|
|
def build_parser():
    """Build the command-line parser for the exporter.

    Returns:
        argparse.ArgumentParser: Parser defining the listen address/port, the
        Varnish instance, the config path, the collector options and the
        module/profile selection. The epilog is printed verbatim by
        ``--help`` because ``RawDescriptionHelpFormatter`` is used.
    """
    epilog = """
Examples:

  Only varnishstat:
    python3 varnish_exporter.py --modules core,stat

  Test VSL without sampling:
    sudo python3 varnish_exporter.py --enable-vsl --vsl-sample 1 --profile full

  Recommended production mode:
    sudo python3 varnish_exporter.py --enable-vsl --vsl-sample 0.001 --profile standard

  Debug with raw metrics:
    sudo python3 varnish_exporter.py --modules core,stat,vsl,domain,raw --profile raw --vsl-sample 1

  With domain config:
    sudo python3 varnish_exporter.py --config /etc/varnish-exporter/config.json --enable-vsl

Modules:
  core      exporter self metrics
  stat      varnishstat -1 -j
  vsl       varnishlog -g request
  domain    derived per-domain statistics
  raw       raw request counters/histograms

Profiles:
  minimal   small set: rps, hit ratio, backend ratio, 5xx, p95
  standard  recommended: business-oriented domain statistics
  full      more detailed statistics
  raw       full + raw HTTP metrics
"""

    parser = argparse.ArgumentParser(
        description="Varnish Prometheus Business Exporter - per-domain statistics from varnishstat and varnishlog.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=epilog,
    )

    parser.add_argument("--listen", default="0.0.0.0", help="HTTP listen address. Default: 0.0.0.0")
    parser.add_argument("--port", type=int, default=9131, help="HTTP /metrics port. Default: 9131")
    parser.add_argument("--instance", default="", help="Varnish instance for -n. Usually empty.")
    parser.add_argument("--config", default="", help="Path to config.json with domain/site rules.")
    parser.add_argument("--stat-interval", type=int, default=5, help="varnishstat interval in seconds. Default: 5")

    # The default goes through parse_modules() as well, so args.modules is
    # always a set of validated module names.
    parser.add_argument(
        "--modules",
        type=parse_modules,
        default=parse_modules("core,stat,vsl,domain"),
        help="Modules to enable: core,stat,vsl,domain,raw or all. Default: core,stat,vsl,domain",
    )

    parser.add_argument(
        "--profile",
        choices=["minimal", "standard", "full", "raw"],
        default="standard",
        help="Domain metric detail level. Default: standard",
    )

    parser.add_argument("--enable-vsl", action="store_true", help="Enable varnishlog/VSL collector.")
    parser.add_argument("--enable-varnishlog", action="store_true", help="Alias for --enable-vsl.")
    # "%%" is argparse's escape for a literal percent sign in help text.
    parser.add_argument("--vsl-sample", type=float, default=0.001, help="VSL sampling: 1=100%%, 0.001=0.1%%. Default: 0.001")
    parser.add_argument("--max-series", type=int, default=10000, help="Maximum number of series in the exporter. Default: 10000")
    parser.add_argument("--window-seconds", type=int, default=60, help="Window for domain statistics. Default: 60")
    parser.add_argument("--bucket-seconds", type=int, default=5, help="Internal bucket size for the domain window. Default: 5")

    return parser
|
|
|
|
|
|
def main():
    """Parse the command line, start the collectors and serve ``/metrics``.

    Raises:
        SystemExit: An option value is out of range.
    """
    parser = build_parser()
    args = parser.parse_args()

    modules = set(args.modules)

    if args.enable_vsl or args.enable_varnishlog:
        modules.add("vsl")

    # Enabling the raw module implies the raw profile.
    if "raw" in modules and args.profile != "raw":
        args.profile = "raw"

    if args.vsl_sample <= 0 or args.vsl_sample > 1:
        raise SystemExit("--vsl-sample must be in range 0 < x <= 1")

    if args.window_seconds < 10:
        raise SystemExit("--window-seconds must be >= 10")

    if args.bucket_seconds < 1:
        raise SystemExit("--bucket-seconds must be >= 1")

    # A bucket wider than the window can start before the window does, which
    # would intermittently drop the current bucket from the derived stats.
    if args.bucket_seconds > args.window_seconds:
        raise SystemExit("--bucket-seconds must be <= --window-seconds")

    # 0 would poll varnishstat in a busy loop; a negative value makes
    # time.sleep() raise inside the collector thread.
    if args.stat_interval < 1:
        raise SystemExit("--stat-interval must be >= 1")

    cfg = load_config(args.config)

    metrics = Metrics(
        buckets=cfg["histogram_buckets"],
        max_series=args.max_series,
        window_seconds=args.window_seconds,
        bucket_seconds=args.bucket_seconds,
        profile=args.profile,
        modules=modules,
    )

    if "stat" in modules:
        VarnishStatCollector(
            metrics=metrics,
            interval=args.stat_interval,
            instance=args.instance,
        ).start()
    else:
        metrics.set_gauge("varnish_exporter_collector_up", 0, collector="varnishstat")

    if "vsl" in modules:
        VarnishLogCollector(
            metrics=metrics,
            cfg=cfg,
            instance=args.instance,
            sample_rate=args.vsl_sample,
        ).start()
    else:
        metrics.set_gauge("varnish_exporter_collector_up", 0, collector="varnishlog")

    Handler.metrics = metrics

    server = ThreadingHTTPServer((args.listen, args.port), Handler)
    print(f"listening on http://{args.listen}:{args.port}/metrics")
    print(f"profile={args.profile}")
    print(f"modules={','.join(sorted(modules))}")

    try:
        server.serve_forever()
    finally:
        # Release the listening socket however the serve loop ends.
        server.server_close()
|
|
|
|
|
|
# Run the exporter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|