# varnish_exporter/varnish_exporter.py

#!/usr/bin/env python3
"""
Varnish Prometheus Business Exporter

Purpose:
  A lightweight Prometheus exporter for Varnish focused not only on raw counters,
  but also on ready-to-use business and operational statistics per domain/site.

Data sources:
  1. varnishstat -1 -j
     - global Varnish metrics
     - storage, backends, cache hits/misses, workers, locks, etc.

  2. varnishlog -g request
     - sampled request stream
     - per-domain aggregation
     - hit/miss/pass/backend ratios
     - latency p50/p90/p95/p99
     - error ratios
     - RPS
     - saved backend RPS

Main domain metrics:
  varnish_domain_rps
  varnish_domain_hit_ratio
  varnish_domain_miss_ratio
  varnish_domain_pass_ratio
  varnish_domain_backend_ratio
  varnish_domain_cache_efficiency_ratio
  varnish_domain_saved_backend_rps
  varnish_domain_error_ratio
  varnish_domain_4xx_ratio
  varnish_domain_5xx_ratio
  varnish_domain_avg_latency_seconds
  varnish_domain_p50_latency_seconds
  varnish_domain_p90_latency_seconds
  varnish_domain_p95_latency_seconds
  varnish_domain_p99_latency_seconds
  varnish_domain_slow_100ms_ratio
  varnish_domain_slow_250ms_ratio
  varnish_domain_slow_500ms_ratio
  varnish_domain_slow_1s_ratio

Profiles:
  minimal:
    - exporter health
    - varnishstat
    - per-domain: rps, hit_ratio, backend_ratio, 5xx_ratio, p95

  standard:
    - minimal +
    - miss/pass ratio
    - cache efficiency
    - backend rps
    - saved backend rps
    - error ratio
    - avg/p50/p90/p99 latency
    - slow ratios

  full:
    - standard +
    - per-cache average latency
    - 2xx/3xx/4xx/5xx rps/ratio
    - pipe/synth/unknown ratios

  raw:
    - full +
    - raw varnish_http_requests_total
    - raw varnish_http_request_duration_seconds
    - varnish_domain_response_time_seconds histogram

Modules:
  core:
    - exporter self metrics

  stat:
    - varnishstat metrics

  vsl:
    - varnishlog collector

  domain:
    - derived per-domain metrics

  raw:
    - raw HTTP request counters/histograms

Defaults:
  --modules core,stat,vsl,domain
  --profile standard
  --vsl-sample 0.001

Examples:

  Local test without sampling:
    sudo python3 varnish_exporter.py --enable-vsl --vsl-sample 1 --profile full

  Production:
    sudo python3 varnish_exporter.py --enable-vsl --vsl-sample 0.001 --profile standard

  Only varnishstat:
    python3 varnish_exporter.py --modules core,stat

  Only domain aggregates, no raw HTTP metrics:
    sudo python3 varnish_exporter.py --modules core,stat,vsl,domain --profile standard

  Debug with raw metrics:
    sudo python3 varnish_exporter.py --modules core,stat,vsl,domain,raw --profile raw --vsl-sample 1

JSON config example:
  {
    "site_rules": [
      {
        "match": "(^|\\\\.)example\\\\.com$",
        "site": "example_com"
      },
      {
        "match": "(^|\\\\.)static\\\\.example\\\\.com$",
        "site": "static_example_com"
      }
    ],
    "default_site": "other",
    "allowed_methods": ["GET", "HEAD", "POST", "PUT", "PATCH", "DELETE", "OPTIONS"],
    "histogram_buckets": [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10]
  }

Notes:
  - Varnish cannot see real browser render time.
  - Latency here means server-side response time as seen by Varnish.
  - For very high traffic, do not use --vsl-sample 1 in production.
  - Sensible production values: 0.001 or 0.0001.
"""

import argparse
import json
import random
import re
import subprocess
import threading
import time
from collections import defaultdict
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer


# Built-in settings: load_config() starts from a copy of this mapping when no
# config path is given, and borrows its method/bucket lists for missing keys.
DEFAULT_CONFIG = dict(
    site_rules=[],
    default_site="other",
    allowed_methods=["GET", "HEAD", "POST", "PUT", "PATCH", "DELETE", "OPTIONS"],
    histogram_buckets=[0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
)


# Module names accepted by parse_modules(); "all" expands to this whole set.
VALID_MODULES = set(("core", "stat", "vsl", "domain", "raw"))


def load_config(path):
    """Build the runtime configuration mapping.

    Args:
        path: Path to a JSON config file. A falsy value selects a shallow
            copy of ``DEFAULT_CONFIG`` instead of reading a file.

    Returns:
        The config dict with any missing top-level keys filled in, plus two
        derived entries: ``site_rules_compiled`` (a list of
        ``(compiled_regex, site)`` pairs, case-insensitive) and
        ``allowed_methods_set`` (a set built from ``allowed_methods``).
    """
    if path:
        with open(path, "r", encoding="utf-8") as handle:
            cfg = json.load(handle)
    else:
        cfg = dict(DEFAULT_CONFIG)

    fallbacks = (
        ("site_rules", []),
        ("default_site", "other"),
        ("allowed_methods", DEFAULT_CONFIG["allowed_methods"]),
        ("histogram_buckets", DEFAULT_CONFIG["histogram_buckets"]),
    )
    for key, fallback in fallbacks:
        cfg.setdefault(key, fallback)

    # Compile the rules once here so request handling only runs regex.search().
    compiled_rules = []
    for rule in cfg.get("site_rules", []):
        compiled_rules.append((re.compile(rule["match"], re.I), rule["site"]))

    cfg["site_rules_compiled"] = compiled_rules
    cfg["allowed_methods_set"] = set(cfg["allowed_methods"])
    return cfg


def parse_modules(value):
    """Parse a comma-separated module list into a set of module names.

    Items are stripped and lower-cased; empty items are ignored. The first
    ``all`` item encountered short-circuits to a copy of ``VALID_MODULES``.

    Raises:
        argparse.ArgumentTypeError: on an unknown module name, or when no
            module name is left after parsing.
    """
    selected = set()

    for token in (part.strip().lower() for part in value.split(",")):
        if token == "":
            continue

        if token == "all":
            return set(VALID_MODULES)

        if token in VALID_MODULES:
            selected.add(token)
            continue

        raise argparse.ArgumentTypeError(
            f"unknown module: {token}; available modules: {','.join(sorted(VALID_MODULES))}"
        )

    if selected:
        return selected

    raise argparse.ArgumentTypeError("module list cannot be empty")


def prom_escape(value):
    """Return ``str(value)`` with backslash, double quote and newline escaped."""
    text = str(value)

    # Backslash goes first so the backslashes added by later steps stay single.
    for raw, escaped in (("\\", "\\\\"), ('"', '\\"'), ("\n", "\\n")):
        text = text.replace(raw, escaped)

    return text


def prom_name(name):
    """Turn an arbitrary value into a lower-case ``[a-z0-9_]`` identifier.

    Every other character becomes ``_``, runs of ``_`` collapse to one, and
    leading/trailing ``_`` are removed. A result starting with a digit gets a
    ``_`` prefix; an empty result becomes ``"unknown"``.
    """
    lowered = str(name).lower()
    underscored = re.sub(r"[^a-z0-9_]", "_", lowered)
    cleaned = re.sub(r"_+", "_", underscored).strip("_")

    if not cleaned:
        return "unknown"

    if cleaned[0].isdigit():
        return "_" + cleaned

    return cleaned


def status_class(status):
    """Map an HTTP status to ``"1xx"``..``"5xx"``, or ``"unknown"``.

    ``"unknown"`` is returned when ``int(status)`` fails or the code falls
    outside 100-599.
    """
    try:
        code = int(status)
    except Exception:
        return "unknown"

    hundreds = code // 100
    return f"{hundreds}xx" if 1 <= hundreds <= 5 else "unknown"


def normalize_method(method, cfg):
    """Upper-case ``method`` and fold anything not allowed into ``"OTHER"``.

    A falsy ``method`` is treated as ``"UNKNOWN"`` before the lookup in
    ``cfg["allowed_methods_set"]``.
    """
    candidate = str(method or "UNKNOWN").upper()

    if candidate in cfg["allowed_methods_set"]:
        return candidate

    return "OTHER"


def normalize_host(host):
    """Reduce a ``Host`` header value to a bare lower-case host name.

    Handles, in order:
      * surrounding whitespace and letter case;
      * bracketed IPv6 literals (``[::1]:8080`` -> ``::1``);
      * a single ``:port`` suffix (``example.com:8080`` -> ``example.com``);
      * the trailing dot of a fully-qualified name (``example.com.``).

    Args:
        host: Raw header value; ``None`` or any falsy value is accepted.

    Returns:
        The normalized host, or ``""`` when nothing usable remains.
    """
    host = str(host or "").strip().lower()

    if not host:
        return ""

    if host.startswith("["):
        closing = host.find("]")
        if closing != -1:
            return host[1:closing]

    # Only a lone colon can separate host and port. Two or more colons mean an
    # unbracketed IPv6 literal, which must not be cut at its first colon.
    if host.count(":") == 1:
        host = host.partition(":")[0]

    # Strip the FQDN dot last so "example.com.:8080" also ends up dot-free.
    return host.rstrip(".")


def site_group(host, cfg):
    """Resolve a request host to the ``site`` label value.

    The host is normalized first. An empty result maps to the configured
    ``default_site`` (``"other"`` if unset). Otherwise the site of the first
    rule in ``cfg["site_rules_compiled"]`` whose regex matches is returned,
    and a host that matches no rule is returned as-is.
    """
    normalized = normalize_host(host)

    if not normalized:
        return cfg.get("default_site", "other")

    matching_sites = (
        site
        for pattern, site in cfg["site_rules_compiled"]
        if pattern.search(normalized)
    )
    return next(matching_sites, normalized)


def format_metric(name, labels, value):
    """Render one exposition line: ``name{k="v",...} value``.

    ``labels`` is an iterable of ``(key, value)`` pairs; they are sorted and
    the values escaped. With no labels the braces are omitted entirely.
    """
    if not labels:
        return f"{name} {value}"

    rendered = []
    for key, raw in sorted(labels):
        rendered.append(f'{key}="{prom_escape(raw)}"')

    return name + "{" + ",".join(rendered) + "}" + f" {value}"


def estimate_quantile_from_buckets(buckets, count, q):
    """Estimate quantile ``q`` from cumulative histogram buckets.

    Args:
        buckets: Mapping of upper bound -> cumulative count. A ``float("inf")``
            key is ignored; only finite bounds take part in the estimate.
        count: Total number of observations.
        q: Quantile in the range 0..1.

    Returns:
        A value linearly interpolated inside the first bucket whose cumulative
        count reaches ``count * q`` (the first bucket's lower edge is 0.0).
        Returns 0.0 when ``count`` is not positive or there are no finite
        bounds, and the largest finite bound when no bucket reaches the rank.
    """
    if count <= 0:
        return 0.0

    infinity = float("inf")
    bounds = [bound for bound in buckets if bound != infinity]
    bounds.sort()

    if not bounds:
        return 0.0

    rank = count * q
    lower_bound, lower_count = 0.0, 0.0

    for upper_bound in bounds:
        upper_count = buckets.get(upper_bound, 0.0)

        if upper_count >= rank:
            # An empty span cannot be interpolated; report its upper edge.
            if upper_count <= lower_count:
                return upper_bound

            share = (rank - lower_count) / (upper_count - lower_count)
            return lower_bound + ((upper_bound - lower_bound) * share)

        lower_bound, lower_count = upper_bound, upper_count

    return bounds[-1]


class Metrics:
    """Thread-safe in-memory metric registry with a per-site sliding window.

    Holds three plain stores (gauges, counters, histograms), each keyed by
    ``(metric_name, sorted_label_tuple)``, plus ``domain_window``: per
    ``(bucket_start_ts, site)`` accumulators from which the derived
    ``varnish_domain_*`` gauges are computed at render time.

    All mutable state is guarded by ``self.lock``. Collector threads write
    while HTTP handler threads read.

    Args:
        buckets: Histogram upper bounds in seconds (``+Inf`` is implicit).
        max_series: Cap on distinct ``(metric, labels)`` series in the plain
            stores; samples for series beyond the cap are dropped.
        window_seconds: Length of the sliding window for derived metrics.
        bucket_seconds: Width of one time bucket inside the window.
        profile: One of ``minimal``, ``standard``, ``full``, ``raw``.
        modules: Set of enabled module names.
    """

    # (window counter name, latency threshold in seconds); a request counts
    # towards a counter when its latency is strictly above the threshold.
    _SLOW_THRESHOLDS = (
        ("slow_50ms", 0.05),
        ("slow_100ms", 0.1),
        ("slow_250ms", 0.25),
        ("slow_500ms", 0.5),
        ("slow_1s", 1.0),
        ("slow_2500ms", 2.5),
        ("slow_5s", 5.0),
    )

    # Cache outcomes that get a per-cache average latency in the full profile.
    _CACHE_STATES = ("hit", "miss", "pass", "pipe", "synth", "unknown")

    def __init__(self, buckets, max_series, window_seconds, bucket_seconds, profile, modules):
        self.lock = threading.RLock()
        self.buckets = buckets
        self.max_series = max_series
        self.window_seconds = window_seconds
        self.bucket_seconds = bucket_seconds
        self.profile = profile
        self.modules = modules

        self.gauges = {}
        self.counters = {}
        self.histograms = defaultdict(lambda: {
            "buckets": defaultdict(float),
            "sum": 0.0,
            "count": 0.0,
        })

        self.series_seen = set()
        self.dropped_series = 0

        self.domain_window = defaultdict(lambda: defaultdict(float))

        # Start of the time bucket in which record_http last purged expired
        # window entries; lets it purge once per bucket, not once per request.
        self._last_cleanup_bucket = None

    def module_enabled(self, name):
        """Return True when module ``name`` is enabled."""
        return name in self.modules

    def emit_raw(self):
        """Return True when raw request counters/histograms are recorded."""
        return self.profile == "raw" or self.module_enabled("raw")

    def emit_minimal(self):
        """Return True when the profile includes the minimal metric set."""
        return self.profile in ("minimal", "standard", "full", "raw")

    def emit_standard(self):
        """Return True when the profile includes the standard metric set."""
        return self.profile in ("standard", "full", "raw")

    def emit_full(self):
        """Return True when the profile includes the full metric set."""
        return self.profile in ("full", "raw")

    def _label_key(self, labels):
        """Convert a label dict into a hashable, sorted tuple of str pairs."""
        return tuple(sorted((str(k), str(v)) for k, v in labels.items()))

    def _allow_series(self, metric, labels):
        """Admit a series against ``max_series``; callers hold ``self.lock``.

        Known series are always allowed. A new series is admitted while the
        cap is not reached; otherwise ``dropped_series`` is incremented (once
        per rejected sample) and False is returned.
        """
        key = (metric, labels)

        if key in self.series_seen:
            return True

        if len(self.series_seen) >= self.max_series:
            self.dropped_series += 1
            return False

        self.series_seen.add(key)
        return True

    @staticmethod
    def _counter_name(name):
        """Return ``name`` with a ``_total`` suffix, adding it if missing."""
        return name if name.endswith("_total") else name + "_total"

    def set_gauge(self, name, value, **labels):
        """Set gauge ``name`` with the given labels to ``float(value)``."""
        labels_key = self._label_key(labels)

        with self.lock:
            if not self._allow_series(name, labels_key):
                return
            self.gauges[(name, labels_key)] = float(value)

    def set_counter(self, name, value, **labels):
        """Overwrite a counter with an absolute value (``_total`` enforced)."""
        name = self._counter_name(name)
        labels_key = self._label_key(labels)

        with self.lock:
            if not self._allow_series(name, labels_key):
                return
            self.counters[(name, labels_key)] = float(value)

    def inc_counter(self, name, amount=1.0, **labels):
        """Add ``amount`` to a counter (``_total`` enforced)."""
        name = self._counter_name(name)
        labels_key = self._label_key(labels)

        with self.lock:
            if not self._allow_series(name, labels_key):
                return
            key = (name, labels_key)
            self.counters[key] = self.counters.get(key, 0.0) + float(amount)

    def observe(self, name, value, weight=1.0, **labels):
        """Record ``value`` in a cumulative histogram, counted ``weight`` times."""
        labels_key = self._label_key(labels)
        weight = float(weight)

        with self.lock:
            if not self._allow_series(name, labels_key):
                return

            h = self.histograms[(name, labels_key)]
            h["sum"] += float(value) * weight
            h["count"] += weight

            # Buckets are cumulative: every bound >= value receives the sample.
            for bound in self.buckets:
                if value <= bound:
                    h["buckets"][bound] += weight

            h["buckets"][float("inf")] += weight

    def record_http(self, site, method, status_class_value, cache, latency, weight):
        """Account one (possibly sampled) client request.

        Args:
            site: Site label value.
            method: Normalized HTTP method.
            status_class_value: ``"2xx"``..``"5xx"`` or ``"unknown"``.
            cache: Cache outcome (``hit``, ``miss``, ``pass``, ...).
            latency: Response time in seconds, or None when not observed.
            weight: How many real requests this sample stands for.
        """
        raw_labels = {
            "site": site,
            "method": method,
            "status_class": status_class_value,
            "cache": cache,
        }

        if self.emit_raw():
            self.inc_counter("varnish_http_requests_total", weight, **raw_labels)

            if latency is not None:
                self.observe(
                    "varnish_http_request_duration_seconds",
                    latency,
                    weight=weight,
                    **raw_labels,
                )

                self.observe(
                    "varnish_domain_response_time_seconds",
                    latency,
                    weight=weight,
                    site=site,
                )

        if not self.module_enabled("domain"):
            return

        now = int(time.time())
        bucket_ts = now - (now % self.bucket_seconds)

        with self.lock:
            b = self.domain_window[(bucket_ts, site)]

            b["total"] += weight
            b[f"cache_{cache}"] += weight

            if cache in ("hit", "miss"):
                b["cacheable"] += weight

            if cache in ("miss", "pass"):
                b["backend"] += weight

            if status_class_value in ("2xx", "3xx", "4xx", "5xx"):
                b[status_class_value] += weight
                if status_class_value in ("4xx", "5xx"):
                    b["errors"] += weight

            if latency is not None:
                b["latency_sum"] += latency * weight
                b["latency_count"] += weight
                b[f"cache_{cache}_latency_sum"] += latency * weight
                b[f"cache_{cache}_latency_count"] += weight

                for counter, threshold in self._SLOW_THRESHOLDS:
                    if latency > threshold:
                        b[counter] += weight

                for le in self.buckets:
                    if latency <= le:
                        b[f"latency_le_{le}"] += weight

                b["latency_le_inf"] += weight

            # Entries can only expire when the clock enters a new bucket, so a
            # scan per bucket is enough; a scan per request walks the whole
            # window for every sampled request.
            if bucket_ts != self._last_cleanup_bucket:
                self._last_cleanup_bucket = bucket_ts
                self.cleanup_window_locked(now)

    def cleanup_window_locked(self, now):
        """Drop window entries older than the window plus one bucket.

        Callers must hold ``self.lock``.
        """
        min_ts = now - self.window_seconds - self.bucket_seconds

        expired = [key for key in self.domain_window if key[0] < min_ts]
        for key in expired:
            del self.domain_window[key]

    def calculate_domain_window_stats(self):
        """Compute derived per-site gauges for the current window.

        Returns:
            Dict mapping ``(metric_name, (("site", site),))`` to a float.
            Empty when the ``domain`` module is disabled. Sites with no
            traffic inside the window are omitted.
        """
        if not self.module_enabled("domain"):
            return {}

        now = int(time.time())
        min_ts = now - self.window_seconds

        with self.lock:
            self.cleanup_window_locked(now)
            # Copy the inner dicts while still holding the lock: record_http
            # keeps adding keys to the live ones, and iterating them unlocked
            # can raise "dictionary changed size during iteration".
            snapshot = [(key, dict(values)) for key, values in self.domain_window.items()]

        stats = defaultdict(lambda: defaultdict(float))

        for (bucket_ts, site), values in snapshot:
            if bucket_ts < min_ts:
                continue

            totals = stats[site]
            for field, amount in values.items():
                totals[field] += float(amount)

        out = {}

        for site, totals in stats.items():
            if totals.get("total", 0.0) <= 0:
                continue

            labels = (("site", site),)
            for metric, value in self._site_metrics(totals).items():
                out[(metric, labels)] = value

        return out

    def _site_metrics(self, s):
        """Derive the metric-name -> value dict for one site's window totals.

        ``s`` maps window counter names to summed values and must have a
        positive ``"total"``. Which metrics are produced depends on the
        profile.
        """
        total = s.get("total", 0.0)
        window = float(self.window_seconds)

        hit = s.get("cache_hit", 0.0)
        miss = s.get("cache_miss", 0.0)
        passed = s.get("cache_pass", 0.0)
        pipe = s.get("cache_pipe", 0.0)
        synth = s.get("cache_synth", 0.0)
        unknown_cache = s.get("cache_unknown", 0.0)

        backend = s.get("backend", 0.0)
        cacheable = s.get("cacheable", 0.0)

        e2xx = s.get("2xx", 0.0)
        e3xx = s.get("3xx", 0.0)
        e4xx = s.get("4xx", 0.0)
        e5xx = s.get("5xx", 0.0)
        errors = s.get("errors", 0.0)

        latency_count = s.get("latency_count", 0.0)
        has_latency = latency_count > 0
        latency_buckets = None

        if has_latency:
            latency_buckets = {le: s.get(f"latency_le_{le}", 0.0) for le in self.buckets}
            latency_buckets[float("inf")] = s.get("latency_le_inf", 0.0)

        def quantile(q):
            return estimate_quantile_from_buckets(latency_buckets, latency_count, q)

        m = {
            "varnish_domain_rps": total / window,
            "varnish_domain_hit_ratio": hit / total,
            "varnish_domain_backend_ratio": backend / total,
            "varnish_domain_5xx_ratio": e5xx / total,
        }

        if has_latency:
            m["varnish_domain_p95_latency_seconds"] = quantile(0.95)

        if self.emit_standard():
            m["varnish_domain_requests_per_second"] = total / window
            m["varnish_domain_hit_rps"] = hit / window
            m["varnish_domain_miss_rps"] = miss / window
            m["varnish_domain_pass_rps"] = passed / window
            m["varnish_domain_backend_rps"] = backend / window
            # Every hit is counted as one backend request that was avoided.
            m["varnish_domain_saved_backend_rps"] = hit / window

            m["varnish_domain_miss_ratio"] = miss / total
            m["varnish_domain_pass_ratio"] = passed / total
            m["varnish_domain_cacheable_ratio"] = cacheable / total
            m["varnish_domain_4xx_ratio"] = e4xx / total
            m["varnish_domain_error_ratio"] = errors / total
            m["varnish_domain_cache_efficiency_ratio"] = hit / cacheable if cacheable > 0 else 0.0

            if has_latency:
                m["varnish_domain_avg_latency_seconds"] = s.get("latency_sum", 0.0) / latency_count
                m["varnish_domain_latency_observed_ratio"] = latency_count / total

                m["varnish_domain_p50_latency_seconds"] = quantile(0.50)
                m["varnish_domain_p90_latency_seconds"] = quantile(0.90)
                m["varnish_domain_p99_latency_seconds"] = quantile(0.99)

                for slow in ("slow_100ms", "slow_250ms", "slow_500ms", "slow_1s"):
                    m[f"varnish_domain_{slow}_ratio"] = s.get(slow, 0.0) / latency_count

        if self.emit_full():
            m["varnish_domain_pipe_rps"] = pipe / window
            m["varnish_domain_synth_rps"] = synth / window
            m["varnish_domain_unknown_cache_rps"] = unknown_cache / window

            m["varnish_domain_pipe_ratio"] = pipe / total
            m["varnish_domain_synth_ratio"] = synth / total
            m["varnish_domain_unknown_cache_ratio"] = unknown_cache / total

            m["varnish_domain_2xx_ratio"] = e2xx / total
            m["varnish_domain_3xx_ratio"] = e3xx / total
            m["varnish_domain_2xx_rps"] = e2xx / window
            m["varnish_domain_3xx_rps"] = e3xx / window
            m["varnish_domain_4xx_rps"] = e4xx / window
            m["varnish_domain_5xx_rps"] = e5xx / window
            m["varnish_domain_error_rps"] = errors / window

            if has_latency:
                for slow in ("slow_50ms", "slow_2500ms", "slow_5s"):
                    m[f"varnish_domain_{slow}_ratio"] = s.get(slow, 0.0) / latency_count

            for cache_name in self._CACHE_STATES:
                cache_latency_count = s.get(f"cache_{cache_name}_latency_count", 0.0)

                if cache_latency_count > 0:
                    cache_latency_sum = s.get(f"cache_{cache_name}_latency_sum", 0.0)
                    m[f"varnish_domain_{cache_name}_avg_latency_seconds"] = (
                        cache_latency_sum / cache_latency_count
                    )

        return m

    def render(self):
        """Return every metric in the Prometheus text exposition format."""
        with self.lock:
            gauges = dict(self.gauges)
            counters = dict(self.counters)
            histograms = {
                k: {
                    "buckets": dict(v["buckets"]),
                    "sum": v["sum"],
                    "count": v["count"],
                }
                for k, v in self.histograms.items()
            }
            series_count = len(self.series_seen)
            dropped_series = self.dropped_series

        # Derived window gauges are recomputed per scrape and never stored.
        gauges.update(self.calculate_domain_window_stats())

        out = []

        if self.module_enabled("core"):
            out.append("# TYPE varnish_exporter_series gauge")
            out.append(f"varnish_exporter_series {series_count}")

            out.append("# TYPE varnish_exporter_dropped_series_total counter")
            out.append(f"varnish_exporter_dropped_series_total {dropped_series}")

            out.append("# TYPE varnish_exporter_window_seconds gauge")
            out.append(f"varnish_exporter_window_seconds {self.window_seconds}")

            out.append("# TYPE varnish_exporter_profile gauge")
            out.append(f'varnish_exporter_profile{{profile="{self.profile}"}} 1')

            out.append("# TYPE varnish_exporter_module_enabled gauge")
            for module in sorted(self.modules):
                out.append(f'varnish_exporter_module_enabled{{module="{module}"}} 1')

        typed = set()

        for kind, store in (("gauge", gauges), ("counter", counters)):
            for (name, labels), value in sorted(store.items()):
                if name not in typed:
                    out.append(f"# TYPE {name} {kind}")
                    typed.add(name)
                out.append(format_metric(name, labels, value))

        for (name, labels), h in sorted(histograms.items()):
            if name not in typed:
                out.append(f"# TYPE {name} histogram")
                typed.add(name)

            base_labels = dict(labels)
            bucket_rows = [(str(b), h["buckets"].get(b, 0.0)) for b in self.buckets]
            bucket_rows.append(("+Inf", h["buckets"].get(float("inf"), 0.0)))

            for le, cumulative in bucket_rows:
                lb = dict(base_labels)
                lb["le"] = le
                out.append(format_metric(name + "_bucket", tuple(sorted(lb.items())), cumulative))

            out.append(format_metric(name + "_sum", labels, h["sum"]))
            out.append(format_metric(name + "_count", labels, h["count"]))

        return "\n".join(out) + "\n"


class VarnishStatCollector(threading.Thread):
    """Daemon thread that polls ``varnishstat -1 -j`` into ``metrics``.

    Args:
        metrics: Registry that receives the gauges and counters.
        interval: Seconds to sleep between polls.
        instance: Varnish instance name for ``-n``; empty for the default.
    """

    # varnishstat sections whose middle key parts name an object; the object
    # name is exported as a label instead of being folded into the metric name.
    _LABELED_SECTIONS = {
        "VBE": "backend",
        "SMA": "storage",
        "SMF": "storage",
        "MSE": "storage",
        "LCK": "lock",
    }

    def __init__(self, metrics, interval, instance):
        super().__init__(daemon=True)
        self.metrics = metrics
        self.interval = interval
        self.instance = instance

    def run(self):
        """Poll forever, reporting collector health and duration as gauges."""
        while True:
            started = time.time()

            try:
                self.collect()
                up = 1
            except Exception:
                # Thread boundary: any failure (missing binary, timeout, bad
                # JSON) is reported through the gauge and retried next round.
                up = 0

            self.metrics.set_gauge("varnish_exporter_collector_up", up, collector="varnishstat")
            self.metrics.set_gauge(
                "varnish_exporter_collector_duration_seconds",
                time.time() - started,
                collector="varnishstat",
            )

            time.sleep(self.interval)

    def collect(self):
        """Run varnishstat once and store every counter it reports.

        Entries flagged ``"c"`` are stored as counters, everything else as
        gauges.

        Raises:
            OSError, subprocess.SubprocessError, ValueError: when varnishstat
                cannot be run, fails or times out, or prints invalid JSON.
        """
        cmd = ["varnishstat", "-1", "-j"]

        if self.instance:
            cmd.extend(["-n", self.instance])

        raw = subprocess.check_output(cmd, text=True, timeout=10)
        counters = self._counters_from_payload(json.loads(raw))

        for key, item in counters.items():
            # Skips non-counter entries such as "timestamp".
            if not isinstance(item, dict) or "value" not in item:
                continue

            metric, labels = self.metric_from_key(key)

            if item.get("flag", "g") == "c":
                self.metrics.set_counter(metric, item["value"], **labels)
            else:
                self.metrics.set_gauge(metric, item["value"], **labels)

    @staticmethod
    def _counters_from_payload(data):
        """Return the counter mapping from either varnishstat JSON layout.

        Varnish 6.5+ nests the counters under a top-level ``"counters"``
        object; older releases put them directly at the top level.
        """
        if not isinstance(data, dict):
            return {}

        nested = data.get("counters")
        return nested if isinstance(nested, dict) else data

    def metric_from_key(self, key):
        """Translate a varnishstat key into ``(metric_name, labels)``.

        ``SECTION.field`` becomes ``varnish_<section>_<field>``. For labeled
        sections with three or more parts, the middle parts are exported as a
        label and only the last part is the field. Any other key has its
        remaining parts joined into the metric name.
        """
        parts = key.split(".")
        section = prom_name(parts[0])

        if len(parts) == 2:
            return f"varnish_{section}_{prom_name(parts[1])}", {}

        label_name = self._LABELED_SECTIONS.get(parts[0])

        if label_name is not None and len(parts) >= 3:
            # Object names may contain dots themselves, so rejoin the middle.
            object_name = ".".join(parts[1:-1])
            return f"varnish_{section}_{prom_name(parts[-1])}", {label_name: object_name}

        return f"varnish_{section}_{prom_name('_'.join(parts[1:]))}", {}


class VarnishLogCollector(threading.Thread):
    """Daemon thread that samples ``varnishlog -g request`` into ``metrics``.

    Each ``<< Request`` header starts a new transaction, which is kept with
    probability ``sample_rate``. Kept transactions are recorded with weight
    ``1 / sample_rate`` so the totals estimate the unsampled traffic.

    Args:
        metrics: Registry that receives the recorded requests.
        cfg: Config dict as returned by ``load_config``.
        instance: Varnish instance name for ``-n``; empty for the default.
        sample_rate: Fraction of requests to keep.
    """

    # Pause before varnishlog is started again after it stops for any reason.
    RESTART_DELAY_SECONDS = 2

    # "-   Tag   value": a single dash, so nested "--" lines do not match.
    _LINE_RE = re.compile(r"^\s*-\s+([A-Za-z0-9_]+)(?:\s+(.*))?$")
    # "Name: <absolute> <since start> ..."; group 2 is the second number.
    _TIMESTAMP_RE = re.compile(r"^([A-Za-z_]+):\s+\d+\.\d+\s+([0-9.]+)")
    _CACHE_CALLS = frozenset({"HIT", "MISS", "PASS", "PIPE", "SYNTH"})

    def __init__(self, metrics, cfg, instance, sample_rate):
        super().__init__(daemon=True)
        self.metrics = metrics
        self.cfg = cfg
        self.instance = instance
        self.sample_rate = sample_rate
        self.sample_weight = 1.0 / sample_rate if sample_rate > 0 else 0.0

    def run(self):
        """Keep a varnishlog stream alive, restarting it after a delay."""
        while True:
            try:
                self.stream()
            except Exception:
                # Thread boundary: the failure is reported via the gauge below.
                pass

            # stream() returning normally means varnishlog exited, so the
            # collector is down in that case too. Always wait before retrying
            # so an instantly-failing varnishlog is not respawned in a loop.
            self.metrics.set_gauge("varnish_exporter_collector_up", 0, collector="varnishlog")
            time.sleep(self.RESTART_DELAY_SECONDS)

    def stream(self):
        """Run varnishlog and consume its output until it ends.

        The child process is always terminated and reaped on the way out,
        including when parsing raises.
        """
        cmd = [
            "varnishlog",
            "-g", "request",
            "-i", "ReqMethod,ReqHeader,RespStatus,VCL_call,Timestamp,End",
        ]

        if self.instance:
            cmd.extend(["-n", self.instance])

        proc = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            text=True,
            bufsize=1,
        )

        try:
            self.metrics.set_gauge("varnish_exporter_collector_up", 1, collector="varnishlog")

            # tx is None while the current transaction is not being sampled.
            tx = None

            for line in proc.stdout:
                line = line.rstrip("\n")

                if "<< Request" in line:
                    if tx:
                        self.finish_tx(tx)

                    tx = {} if random.random() < self.sample_rate else None
                    continue

                if tx is None:
                    continue

                parsed = self.parse_line(line)

                if not parsed:
                    continue

                tag, value = parsed

                if tag == "End":
                    self.finish_tx(tx)
                    tx = None
                    continue

                self._apply_tag(tx, tag, value)

            if tx:
                self.finish_tx(tx)
        finally:
            self._reap(proc)

    @staticmethod
    def _reap(proc):
        """Close the pipe, stop the child if still running, and wait for it."""
        if proc.stdout is not None:
            proc.stdout.close()

        if proc.poll() is None:
            proc.terminate()

        try:
            proc.wait(timeout=5)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait()

    def _apply_tag(self, tx, tag, value):
        """Fold one parsed log record into the transaction dict ``tx``."""
        if tag == "ReqMethod":
            tx["method"] = value.split()[0] if value else "UNKNOWN"

        elif tag == "RespStatus":
            tx["status"] = value.split()[0] if value else "0"

        elif tag == "ReqHeader":
            if value.lower().startswith("host:"):
                tx["host"] = value.split(":", 1)[1].strip()

        elif tag == "VCL_call":
            cache = self.cache_state(value)

            if cache:
                tx["cache"] = cache

        elif tag == "Timestamp":
            name, latency = self.parse_timestamp(value)

            if latency is None:
                return

            if name == "Resp":
                tx["response_time"] = latency
            elif name == "Fetch":
                tx["backend_time"] = latency
            else:
                # Largest other timestamp; used when no Resp was logged.
                tx["fallback_time"] = max(tx.get("fallback_time", 0.0), latency)

    def parse_line(self, line):
        """Split a ``-   Tag   value`` line into ``(tag, value)``, else None."""
        m = self._LINE_RE.match(line)

        if not m:
            return None

        return m.group(1), (m.group(2) or "").strip()

    def cache_state(self, value):
        """Return the lower-cased outcome for HIT/MISS/PASS/PIPE/SYNTH, else None."""
        value = value.upper().strip()
        return value.lower() if value in self._CACHE_CALLS else None

    def parse_timestamp(self, value):
        """Parse a Timestamp record into ``(name, seconds)``.

        ``seconds`` is the second number after the name. Returns
        ``(None, None)`` when the record does not match or the number is not
        a valid float.
        """
        m = self._TIMESTAMP_RE.match(value)

        if not m:
            return None, None

        try:
            return m.group(1), float(m.group(2))
        except ValueError:
            return None, None

    def finish_tx(self, tx):
        """Normalize a completed transaction and record it; no-op if empty."""
        if not tx:
            return

        self.metrics.record_http(
            site=site_group(tx.get("host", ""), self.cfg),
            method=normalize_method(tx.get("method", "UNKNOWN"), self.cfg),
            status_class_value=status_class(tx.get("status", "0")),
            cache=tx.get("cache", "unknown"),
            latency=tx.get("response_time", tx.get("fallback_time")),
            weight=self.sample_weight,
        )


class Handler(BaseHTTPRequestHandler):
    """HTTP handler that serves the metrics registry at ``/metrics``."""

    # Shared registry; assigned on the class before the server starts.
    metrics = None

    def do_GET(self):
        """Serve ``/metrics`` (query string ignored); anything else is 404."""
        # Compare the path without its query string so a scrape URL such as
        # "/metrics?module=x" is served rather than rejected.
        route = self.path.split("?", 1)[0]

        if route != "/metrics":
            self.send_response(404)
            self.send_header("Content-Length", "0")
            self.end_headers()
            return

        started = time.time()
        body_text = self.metrics.render()
        duration = time.time() - started

        # Stored after rendering, so it appears in the following scrape.
        self.metrics.set_gauge("varnish_exporter_render_duration_seconds", duration)

        body = body_text.encode("utf-8")

        self.send_response(200)
        self.send_header("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, fmt, *args):
        """Suppress the base class's per-request logging."""
        return


def build_parser():
    """Create the command-line parser for the exporter.

    Returns:
        An ``argparse.ArgumentParser`` with all exporter options registered.
        The epilog is shown verbatim (raw formatter) and lists examples,
        modules and profiles.
    """
    usage_notes = """
Examples:

  Only varnishstat:
    python3 varnish_exporter.py --modules core,stat

  Test VSL without sampling:
    sudo python3 varnish_exporter.py --enable-vsl --vsl-sample 1 --profile full

  Recommended production mode:
    sudo python3 varnish_exporter.py --enable-vsl --vsl-sample 0.001 --profile standard

  Debug with raw metrics:
    sudo python3 varnish_exporter.py --modules core,stat,vsl,domain,raw --profile raw --vsl-sample 1

  With domain config:
    sudo python3 varnish_exporter.py --config /etc/varnish-exporter/config.json --enable-vsl

Modules:
  core    exporter self metrics
  stat    varnishstat -1 -j
  vsl     varnishlog -g request
  domain  derived per-domain statistics
  raw     raw request counters/histograms

Profiles:
  minimal   small set: rps, hit ratio, backend ratio, 5xx, p95
  standard  recommended: business-oriented domain statistics
  full      more detailed statistics
  raw       full + raw HTTP metrics
"""

    cli = argparse.ArgumentParser(
        epilog=usage_notes,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="Varnish Prometheus Business Exporter - per-domain statistics from varnishstat and varnishlog.",
    )
    option = cli.add_argument

    # Registration order is the order shown by --help, so the tables below
    # keep the options in their established sequence.
    server_options = (
        ("--listen", {"default": "0.0.0.0", "help": "HTTP listen address. Default: 0.0.0.0"}),
        ("--port", {"type": int, "default": 9131, "help": "HTTP /metrics port. Default: 9131"}),
        ("--instance", {"default": "", "help": "Varnish instance for -n. Usually empty."}),
        ("--config", {"default": "", "help": "Path to config.json with domain/site rules."}),
        ("--stat-interval", {"type": int, "default": 5, "help": "varnishstat interval in seconds. Default: 5"}),
    )
    for flag, settings in server_options:
        option(flag, **settings)

    option(
        "--modules",
        type=parse_modules,
        default=parse_modules("core,stat,vsl,domain"),
        help="Modules to enable: core,stat,vsl,domain,raw or all. Default: core,stat,vsl,domain",
    )
    option(
        "--profile",
        choices=["minimal", "standard", "full", "raw"],
        default="standard",
        help="Domain metric detail level. Default: standard",
    )

    for flag, text in (
        ("--enable-vsl", "Enable varnishlog/VSL collector."),
        ("--enable-varnishlog", "Alias for --enable-vsl."),
    ):
        option(flag, action="store_true", help=text)

    tuning_options = (
        ("--vsl-sample", float, 0.001, "VSL sampling: 1=100%%, 0.001=0.1%%. Default: 0.001"),
        ("--max-series", int, 10000, "Maximum number of series in the exporter. Default: 10000"),
        ("--window-seconds", int, 60, "Window for domain statistics. Default: 60"),
        ("--bucket-seconds", int, 5, "Internal bucket size for the domain window. Default: 5"),
    )
    for flag, kind, fallback, text in tuning_options:
        option(flag, type=kind, default=fallback, help=text)

    return cli


def main():
    """Parse arguments, start the collectors and serve ``/metrics``.

    ``--enable-vsl`` / ``--enable-varnishlog`` add the ``vsl`` module, and
    enabling the ``raw`` module forces the ``raw`` profile.

    Raises:
        SystemExit: when a numeric option is outside its valid range.
    """
    args = build_parser().parse_args()

    modules = set(args.modules)

    if args.enable_vsl or args.enable_varnishlog:
        modules.add("vsl")

    if "raw" in modules and args.profile != "raw":
        args.profile = "raw"

    # Written as a positive range test so that NaN is rejected as well.
    if not 0 < args.vsl_sample <= 1:
        raise SystemExit("--vsl-sample must be in range 0 < x <= 1")

    if args.window_seconds < 10:
        raise SystemExit("--window-seconds must be >= 10")

    if args.bucket_seconds < 1:
        raise SystemExit("--bucket-seconds must be >= 1")

    # A negative interval makes time.sleep() raise inside the collector
    # thread and end it; zero would poll varnishstat in a tight loop.
    if args.stat_interval < 1:
        raise SystemExit("--stat-interval must be >= 1")

    cfg = load_config(args.config)

    metrics = Metrics(
        buckets=cfg["histogram_buckets"],
        max_series=args.max_series,
        window_seconds=args.window_seconds,
        bucket_seconds=args.bucket_seconds,
        profile=args.profile,
        modules=modules,
    )

    if "stat" in modules:
        VarnishStatCollector(
            metrics=metrics,
            interval=args.stat_interval,
            instance=args.instance,
        ).start()
    else:
        metrics.set_gauge("varnish_exporter_collector_up", 0, collector="varnishstat")

    if "vsl" in modules:
        VarnishLogCollector(
            metrics=metrics,
            cfg=cfg,
            instance=args.instance,
            sample_rate=args.vsl_sample,
        ).start()
    else:
        metrics.set_gauge("varnish_exporter_collector_up", 0, collector="varnishlog")

    Handler.metrics = metrics

    server = ThreadingHTTPServer((args.listen, args.port), Handler)
    print(f"listening on http://{args.listen}:{args.port}/metrics")
    print(f"profile={args.profile}")
    print(f"modules={','.join(sorted(modules))}")

    try:
        server.serve_forever()
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop the exporter; exit quietly.
        pass
    finally:
        server.server_close()


if __name__ == "__main__":
    main()