Files
tools_scripts/zfs_probe.py
2026-04-03 10:59:10 +02:00

730 lines
23 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# example: python3 zfs_probe.py --duration 10 --interval 1 --track-files
import argparse
import ast
import json
import math
import os
import re
import shutil
import signal
import statistics
import subprocess
import sys
import time
from collections import defaultdict
from datetime import datetime
def eprint(msg):
    """Write *msg* (coerced to str) to stderr with a newline, flushing immediately."""
    print(msg, file=sys.stderr, flush=True)
def oprint(msg=""):
    """Write *msg* (coerced to str) to stdout with a newline, flushing immediately.

    Called with no argument it emits a blank line.
    """
    print(msg, file=sys.stdout, flush=True)
def human_bytes(n):
    """Render a byte count as a short human-readable string.

    None maps to "-"; bytes are printed as integers ("512 B"), larger
    units with one decimal ("1.5 KiB"); a negative input keeps its sign.
    """
    if n is None:
        return "-"
    negative = n < 0
    value = abs(float(n))
    scale = ["B", "KiB", "MiB", "GiB", "TiB", "PiB"]
    for idx, unit in enumerate(scale):
        # Stop at the first unit that fits, or at PiB regardless.
        if value < 1024.0 or idx == len(scale) - 1:
            if unit == "B":
                text = "{0} B".format(int(value))
            else:
                text = "{0:.1f} {1}".format(value, unit)
            return "-" + text if negative else text
        value /= 1024.0
def human_ns_to_ms(n):
    """Format a nanosecond count as milliseconds with two decimals.

    Returns "-" for None or anything float() cannot convert.
    """
    if n is None:
        return "-"
    try:
        ms = float(n) / 1000000.0
    except Exception:
        return "-"
    return "{0:.2f} ms".format(ms)
def mean_or_zero(values):
    """Arithmetic mean of *values*, or 0.0 when the sequence is empty."""
    if not values:
        return 0.0
    return statistics.mean(values)
def percentile(values, q):
    """Linearly interpolated q-th percentile (q in [0, 1]) of *values*.

    Returns 0.0 for an empty sequence. The input is not mutated.
    """
    if not values:
        return 0.0
    if len(values) == 1:
        return float(values[0])
    ordered = sorted(values)
    rank = (len(ordered) - 1) * q
    below = int(math.floor(rank))
    above = int(math.ceil(rank))
    if below == above:
        # rank landed exactly on an element: no interpolation needed.
        return float(ordered[below])
    weight = rank - below
    return ordered[below] * (1.0 - weight) + ordered[above] * weight
def run_cmd(cmd, timeout=None):
    """Run *cmd* (an argv list) capturing decoded stdout/stderr.

    Never raises on a non-zero exit status (check=False); a timeout,
    however, propagates subprocess.TimeoutExpired.
    Returns (returncode, stdout, stderr).
    """
    completed = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
        timeout=timeout,
        check=False,
    )
    return completed.returncode, completed.stdout, completed.stderr
def command_exists(name):
    """True when *name* resolves to an executable via PATH lookup."""
    return bool(shutil.which(name))
def read_file(path, binary=False):
    """Return the whole contents of *path* (bytes when binary=True, else str)."""
    with open(path, "rb" if binary else "r") as handle:
        return handle.read()
def safe_read_text(path):
    """Best-effort text read: contents of *path*, or '' on any error."""
    try:
        content = read_file(path)
    except Exception:
        content = ""
    return content
def ensure_dir(path):
    """Create directory *path* (including parents) if it does not exist.

    Uses makedirs(exist_ok=True) instead of the original isdir()/makedirs()
    pair, which had a check-then-create race: a directory created by
    another process between the two calls raised FileExistsError.
    """
    os.makedirs(path, exist_ok=True)
def parse_proc_io_text(text):
    """Parse /proc/<pid>/io-style text ("key: int" per line) into a dict.

    Lines without a colon or with a non-integer value are skipped.
    """
    parsed = {}
    for raw in text.splitlines():
        key, sep, value = raw.partition(":")
        if not sep:
            continue
        try:
            parsed[key.strip()] = int(value.strip())
        except ValueError:
            pass
    return parsed
def read_proc_snapshot():
    """Snapshot per-process I/O counters from /proc.

    Entries are keyed "pid:starttime" so a recycled PID is not confused
    with the process it replaced. Processes that disappear mid-read or
    deny access are silently skipped (best effort; partial without root).
    """
    snapshot = {}
    for entry in os.listdir("/proc"):
        if not entry.isdigit():
            continue
        proc_dir = os.path.join("/proc", entry)
        try:
            io_text = read_file(os.path.join(proc_dir, "io"))
            stat_text = read_file(os.path.join(proc_dir, "stat"))
            comm = safe_read_text(os.path.join(proc_dir, "comm")).strip() or "?"
            raw_cmdline = read_file(os.path.join(proc_dir, "cmdline"), binary=True)
            cmdline = raw_cmdline.replace(b"\x00", b" ").decode("utf-8", "replace").strip()
            # Field 22 of /proc/<pid>/stat is the process start time.
            # NOTE(review): naive whitespace split — a comm containing
            # spaces shifts the fields; acceptable for a best-effort probe.
            start = int(stat_text.split()[21])
            snapshot["{0}:{1}".format(entry, start)] = {
                "pid": int(entry),
                "starttime": start,
                "comm": comm,
                "cmdline": cmdline,
                "io": parse_proc_io_text(io_text),
            }
        except Exception:
            # Process exited, is unreadable, or has a malformed stat line.
            continue
    return snapshot
def diff_proc_snapshots(prev, curr, accum):
    """Accumulate per-process I/O counter deltas between two snapshots into *accum*.

    Only processes present in both snapshots (same "pid:starttime" key)
    contribute. Negative deltas (counter resets) are clamped to zero. The
    largest single-interval read/write burst is tracked per process, and
    the stored cmdline is refreshed whenever the current one is non-empty.
    Mutates *accum* in place; returns None.
    """
    counters = ("rchar", "wchar", "syscr", "syscw",
                "read_bytes", "write_bytes", "cancelled_write_bytes")
    for ident, now in curr.items():
        before = prev.get(ident)
        if before is None:
            continue
        template = {
            "pid": now["pid"],
            "comm": now["comm"],
            "cmdline": now["cmdline"],
            "samples": 0,
            "rchar": 0,
            "wchar": 0,
            "syscr": 0,
            "syscw": 0,
            "read_bytes": 0,
            "write_bytes": 0,
            "cancelled_write_bytes": 0,
            "max_interval_read_bytes": 0,
            "max_interval_write_bytes": 0,
        }
        bucket = accum.setdefault(ident, template)
        bucket["samples"] += 1
        if now.get("cmdline"):
            bucket["cmdline"] = now["cmdline"]
        for counter in counters:
            delta = max(0, now["io"].get(counter, 0) - before["io"].get(counter, 0))
            bucket[counter] += delta
            if counter == "read_bytes":
                bucket["max_interval_read_bytes"] = max(bucket["max_interval_read_bytes"], delta)
            elif counter == "write_bytes":
                bucket["max_interval_write_bytes"] = max(bucket["max_interval_write_bytes"], delta)
def get_pools(pool_arg=None):
    """Return names of imported zpools via `zpool list`.

    When *pool_arg* is a non-empty comma-separated string, only pools in
    that list are kept (original order preserved). Raises RuntimeError
    when the zpool command fails.
    """
    rc, out, err = run_cmd(["zpool", "list", "-H", "-o", "name"], timeout=20)
    if rc != 0:
        raise RuntimeError("Nie mogę pobrać listy puli: {0}".format(err.strip()))
    pools = [line.strip() for line in out.splitlines() if line.strip()]
    if not pool_arg:
        return pools
    allowed = {p.strip() for p in pool_arg.split(",") if p.strip()}
    return [p for p in pools if p in allowed]
def zpool_status_text(pools):
    """Return raw `zpool status` output for *pools* (stdout only; errors ignored)."""
    _, stdout, _ = run_cmd(["zpool", "status"] + pools, timeout=30)
    return stdout
def zpool_history_text(pools):
    """Return `zpool history -il` output for *pools*.

    An empty pool list yields ""; a failing command yields a one-line note
    instead of raising.
    """
    if not pools:
        return ""
    rc, out, err = run_cmd(["zpool", "history", "-il"] + pools, timeout=60)
    if rc == 0:
        return out
    return "zpool history niedostępne: {0}\n".format(err.strip())
def zfs_get_properties(pools):
    """Collect selected dataset properties recursively for *pools*.

    Returns a list of {name, property, value} dicts, or a single-element
    list carrying an "error" key when the zfs command fails.
    """
    wanted = "atime,relatime,primarycache,secondarycache,prefetch,recordsize,mountpoint"
    rc, out, err = run_cmd(
        ["zfs", "get", "-H", "-r", "-o", "name,property,value", wanted] + pools,
        timeout=60,
    )
    if rc != 0:
        return [{"error": err.strip()}]
    rows = []
    for line in out.splitlines():
        fields = line.split("\t")
        if len(fields) >= 3:
            rows.append({"name": fields[0], "property": fields[1], "value": fields[2]})
    return rows
def parse_arcstats():
    """Parse /proc/spl/kstat/zfs/arcstats into {stat_name: int}.

    Returns None when the file is missing (no ZFS kernel module) or when
    parsing yields no entries. The header rows ("name"/"class") and
    non-integer values are skipped.
    """
    path = "/proc/spl/kstat/zfs/arcstats"
    if not os.path.exists(path):
        return None
    stats = {}
    for line in safe_read_text(path).splitlines():
        fields = line.split()
        if len(fields) < 3 or fields[0] in ("name", "class"):
            continue
        try:
            stats[fields[0]] = int(fields[2])
        except Exception:
            pass
    return stats or None
def arc_delta(prev, curr):
    """Non-negative deltas of the ARC/L2ARC hit/miss counters between two
    arcstats snapshots; None when either snapshot is missing/empty."""
    if not prev or not curr:
        return None
    interesting = (
        "hits", "misses",
        "demand_data_hits", "demand_data_misses",
        "demand_metadata_hits", "demand_metadata_misses",
        "prefetch_data_hits", "prefetch_data_misses",
        "prefetch_metadata_hits", "prefetch_metadata_misses",
        "l2_hits", "l2_misses",
    )
    return {key: max(0, curr.get(key, 0) - prev.get(key, 0)) for key in interesting}
def parse_zpool_iostat_once(pool_list, interval):
    """
    Works with different versions of zpool iostat.
    Tries with -l (latency columns) first, then falls back without -l.
    Does not assume a timestamp column is present.

    Blocks for roughly *interval* seconds (one -y sample) and returns a
    list of per-pool row dicts: ts, name, alloc, free, rops, wops,
    rbytes, wbytes, plus the *_wait_ns latency fields when -l worked.
    Raises RuntimeError when every command variant fails.
    """
    # -H/-p give tab-separated, exact (unscaled) numbers; -y skips the
    # since-boot summary so exactly one fresh sample is printed.
    candidate_cmds = [
        ["zpool", "iostat", "-H", "-p", "-y", "-l"] + pool_list + [str(interval), "1"],
        ["zpool", "iostat", "-H", "-p", "-y"] + pool_list + [str(interval), "1"],
    ]
    last_err = ""
    out = ""
    for cmd in candidate_cmds:
        rc, out, err = run_cmd(cmd, timeout=interval + 20)
        if rc == 0 and out.strip():
            break
        # Remember the most useful diagnostic and clear out so a later
        # failed variant does not leave stale output behind.
        last_err = (err or out or "").strip()
        out = ""
    if not out.strip():
        raise RuntimeError("zpool iostat failed: {0}".format(last_err))
    now_ts = int(time.time())
    rows = []
    for line in out.splitlines():
        line = line.strip()
        if not line:
            continue
        parts = line.split("\t") if "\t" in line else line.split()
        if len(parts) < 7:
            continue
        # Some zpool versions prepend a timestamp column (all digits);
        # otherwise stamp the row with the current time.
        if parts[0].isdigit():
            ts = int(parts[0])
            data = parts[1:]
        else:
            ts = now_ts
            data = parts
        if len(data) < 7:
            continue
        try:
            row = {
                "ts": ts,
                "name": data[0],
                "alloc": int(data[1]),
                "free": int(data[2]),
                "rops": int(data[3]),
                "wops": int(data[4]),
                "rbytes": int(data[5]),
                "wbytes": int(data[6]),
            }
        except ValueError:
            # Non-numeric row (e.g. a vdev/header line) — skip it.
            continue
        # Optional latency columns from -l (nanoseconds with -p); absent
        # columns are skipped, unparsable ones ("-") recorded as 0.
        latency_names = [
            "total_wait_ns", "disk_wait_ns", "syncq_wait_ns",
            "asyncq_wait_ns", "scrub_wait_ns", "trim_wait_ns", "rebuild_wait_ns",
        ]
        for i, key in enumerate(latency_names, start=7):
            if i < len(data):
                try:
                    row[key] = int(data[i])
                except ValueError:
                    row[key] = 0
        rows.append(row)
    return rows
def summarize_samples(samples_by_pool):
    """Reduce raw per-pool iostat samples to summary statistics.

    For each pool: sample count, avg/p95/max read and write throughput,
    average ops rates, and avg/p95 total/disk wait latencies (ns).
    """
    result = {}
    for pool, rows in samples_by_pool.items():
        reads = [r["rbytes"] for r in rows]
        writes = [r["wbytes"] for r in rows]
        read_ops = [r["rops"] for r in rows]
        write_ops = [r["wops"] for r in rows]
        total_wait = [r.get("total_wait_ns", 0) for r in rows]
        disk_wait = [r.get("disk_wait_ns", 0) for r in rows]
        result[pool] = {
            "samples": len(rows),
            "read_avg": mean_or_zero(reads),
            "write_avg": mean_or_zero(writes),
            "read_p95": percentile(reads, 0.95),
            "write_p95": percentile(writes, 0.95),
            "read_max": max(reads) if reads else 0,
            "write_max": max(writes) if writes else 0,
            "rops_avg": mean_or_zero(read_ops),
            "wops_avg": mean_or_zero(write_ops),
            "total_wait_avg_ns": mean_or_zero(total_wait),
            "disk_wait_avg_ns": mean_or_zero(disk_wait),
            "total_wait_p95_ns": percentile(total_wait, 0.95),
            "disk_wait_p95_ns": percentile(disk_wait, 0.95),
        }
    return result
def top_entries(entries, key, topn):
    """Top *topn* dicts from *entries* ranked by *key* descending.

    Entries whose value for *key* is missing or <= 0 are excluded.
    """
    positive = (e for e in entries if e.get(key, 0) > 0)
    ranked = sorted(positive, key=lambda e: e.get(key, 0), reverse=True)
    return ranked[:topn]
def start_bpftrace(outdir, track_files=False):
    """Launch a background bpftrace collector writing into *outdir*.

    The generated program aggregates read() bytes/calls and block-layer
    I/O per comm and, with track_files=True, openat() paths; its END
    block dumps each map after a ===SECTION=== banner (consumed later by
    parse_bpftrace_output).

    Returns (handle, note): *handle* is a dict holding the Popen object,
    the open output file, and both file paths, or None when bpftrace is
    unusable — in which case *note* explains why.

    Fix vs. original: the output file handle is closed if Popen itself
    raises, instead of being leaked.
    """
    if os.geteuid() != 0:
        return None, "bpftrace pominięty: uruchom jako root."
    if not command_exists("bpftrace"):
        return None, "bpftrace pominięty: brak polecenia bpftrace."
    program = []
    program.append('tracepoint:syscalls:sys_exit_read /args.ret > 0/ { @read_bytes_by_comm[comm] = sum(args.ret); @read_calls_by_comm[comm] = count(); }')
    program.append('tracepoint:block:block_rq_issue { @block_bytes_by_comm[comm] = sum(args.bytes); @block_ios_by_comm[comm] = count(); }')
    if track_files:
        program.append('tracepoint:syscalls:sys_enter_openat { @opens[str(args.filename)] = count(); @opens_by_comm[comm, str(args.filename)] = count(); }')
    program.append('END {')
    program.append(' printf("===READ_BYTES_BY_COMM===\\n"); print(@read_bytes_by_comm);')
    program.append(' printf("===READ_CALLS_BY_COMM===\\n"); print(@read_calls_by_comm);')
    program.append(' printf("===BLOCK_BYTES_BY_COMM===\\n"); print(@block_bytes_by_comm);')
    program.append(' printf("===BLOCK_IOS_BY_COMM===\\n"); print(@block_ios_by_comm);')
    if track_files:
        program.append(' printf("===OPENS===\\n"); print(@opens);')
        program.append(' printf("===OPENS_BY_COMM===\\n"); print(@opens_by_comm);')
    program.append('}')
    bt_path = os.path.join(outdir, "trace.bt")
    with open(bt_path, "w") as f:
        f.write("\n".join(program) + "\n")
    out_path = os.path.join(outdir, "bpftrace.txt")
    out_f = open(out_path, "w")
    try:
        proc = subprocess.Popen(
            ["bpftrace", bt_path],
            stdout=out_f,
            stderr=subprocess.STDOUT,
            universal_newlines=True,
        )
    except Exception:
        # Don't leak the output file handle when the process cannot start.
        out_f.close()
        raise
    return {"proc": proc, "out_f": out_f, "out_path": out_path, "bt_path": bt_path}, None
# Section banner emitted by the bpftrace END block, e.g. "===OPENS===".
SECTION_RE = re.compile(r"^===([A-Z0-9_]+)===$")
# One bpftrace map dump line: '@map_name[key, ...]: value'.
MAP_LINE_RE = re.compile(r'^@[^[]+\[(.*)\]:\s+(-?\d+)$')
def parse_bpftrace_output(path):
    """Parse the bpftrace END-block dump at *path* into {SECTION: {key: int}}.

    Map keys are literal-eval'ed where possible (so quoted strings and
    tuple keys like (comm, filename) come back as Python values);
    otherwise the raw key text with surrounding quotes stripped is kept.
    A missing or empty *path* yields an empty mapping.
    """
    result = defaultdict(dict)
    if not path or not os.path.exists(path):
        return result
    current = None
    for raw in safe_read_text(path).splitlines():
        stripped = raw.strip()
        banner = SECTION_RE.match(stripped)
        if banner:
            current = banner.group(1)
            continue
        entry = MAP_LINE_RE.match(stripped)
        if entry is None or current is None:
            continue
        key_text = entry.group(1).strip()
        count = int(entry.group(2))
        try:
            key = ast.literal_eval(key_text)
        except Exception:
            # Bare identifiers aren't literals; retry as a 1-tuple, then
            # fall back to the raw text without surrounding quotes.
            try:
                key = ast.literal_eval("({0},)".format(key_text))
            except Exception:
                key = key_text.strip('"')
        result[current][key] = count
    return result
def format_proc_entry(e):
    """One-line label for a /proc accumulator entry: 'comm [pid N] cmdline'.

    Falls back to comm or '?' when the cmdline is empty, and truncates the
    command text to 120 characters.
    """
    label = e.get("cmdline") or e.get("comm") or "?"
    if len(label) > 120:
        label = label[:117] + "..."
    return "{0} [pid {1}] {2}".format(e.get("comm", "?"), e.get("pid", "?"), label)
def print_table(title, rows, cols):
    """Print *rows* (list of dicts) as a simple left-aligned text table.

    *cols* is a list of (header, dict_key) pairs controlling column order
    and width. An empty row list prints 'brak danych' (no data) instead.
    """
    oprint(title)
    if not rows:
        oprint(" brak danych")
        oprint()
        return
    # Each column is as wide as its header or widest cell.
    widths = [
        max([len(header)] + [len(str(row.get(key, ""))) for row in rows])
        for header, key in cols
    ]
    oprint(" " + " ".join(h.ljust(w) for (h, _), w in zip(cols, widths)))
    oprint(" " + " ".join("-" * w for w in widths))
    for row in rows:
        oprint(" " + " ".join(str(row.get(k, "")).ljust(w) for (_, k), w in zip(cols, widths)))
    oprint()
def stop_bpftrace(handle):
    """Stop a collector started by start_bpftrace and close its output file.

    Escalates SIGINT (which lets the bpftrace END block print its maps)
    -> terminate -> kill, each with a short grace period. A None handle
    is a no-op.
    """
    if not handle:
        return
    proc = handle["proc"]
    if proc.poll() is not None:
        # Already exited; just release the output file.
        handle["out_f"].close()
        return
    for stop, grace in (
        (lambda: proc.send_signal(signal.SIGINT), 3),
        (proc.terminate, 2),
        (proc.kill, 2),
    ):
        try:
            stop()
            proc.wait(timeout=grace)
            break
        except Exception:
            continue
    try:
        handle["out_f"].close()
    except Exception:
        pass
def main():
    """CLI entry point.

    Samples `zpool iostat`, ARC counters and per-process /proc I/O for
    --duration seconds at --interval granularity (optionally tracing
    reads/block I/O/opens with bpftrace), writes raw results as JSON and
    text files into an output directory, and prints top-N summary tables.
    """
    ap = argparse.ArgumentParser(description="Zbiera próbki ZFS/ARC/procesów przez zadany czas i pokazuje top statystyki.")
    ap.add_argument("--duration", type=int, default=3600, help="Czas zbierania w sekundach, np. 7200")
    ap.add_argument("--interval", type=int, default=5, help="Interwał próbek w sekundach")
    ap.add_argument("--pool", default="", help="Opcjonalnie: jedna lub kilka puli, rozdzielone przecinkiem")
    ap.add_argument("--top", type=int, default=15, help="Ile pozycji pokazać w topkach")
    ap.add_argument("--outdir", default="", help="Katalog na logi i JSON")
    ap.add_argument("--track-files", action="store_true", help="Śledź top otwierane pliki przez bpftrace")
    ap.add_argument("--no-bpf", action="store_true", help="Nie uruchamiaj bpftrace nawet gdy jest dostępny")
    args = ap.parse_args()
    if args.interval <= 0 or args.duration <= 0:
        eprint("duration i interval muszą być > 0")
        sys.exit(2)
    if os.geteuid() != 0:
        # Non-root still works, but other users' /proc/<pid>/io and
        # bpftrace will be unavailable or partial.
        eprint("Uwaga: bez roota część /proc i bpftrace może być niepełna.")
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    outdir = args.outdir or os.path.abspath("./zfs_probe_{0}".format(ts))
    ensure_dir(outdir)
    oprint("Start. Zbieram dane do: {0}".format(outdir))
    pools = get_pools(args.pool)
    if not pools:
        eprint("Nie znaleziono żadnych puli ZFS.")
        sys.exit(1)
    oprint("Pule: {0}".format(", ".join(pools)))
    # Record run metadata and the one-shot pool state up front.
    meta = {
        "started_at": datetime.now().isoformat(),
        "duration": args.duration,
        "interval": args.interval,
        "pools": pools,
        "hostname": os.uname().nodename,
        "uid": os.geteuid(),
    }
    with open(os.path.join(outdir, "meta.json"), "w") as f:
        json.dump(meta, f, indent=2)
    with open(os.path.join(outdir, "zpool_status_start.txt"), "w") as f:
        f.write(zpool_status_text(pools))
    with open(os.path.join(outdir, "zpool_history.txt"), "w") as f:
        f.write(zpool_history_text(pools))
    with open(os.path.join(outdir, "zfs_properties.json"), "w") as f:
        json.dump(zfs_get_properties(pools), f, indent=2)
    # Optional bpftrace collector runs for the whole sampling window.
    bpf_handle = None
    bpf_note = None
    if not args.no_bpf:
        bpf_handle, bpf_note = start_bpftrace(outdir, track_files=args.track_files)
    else:
        bpf_note = "bpftrace wyłączony przez --no-bpf"
    if bpf_note:
        oprint(bpf_note)
    # Baseline snapshots for delta computation in the loop below.
    proc_prev = read_proc_snapshot()
    arc_prev = parse_arcstats()
    samples_by_pool = defaultdict(list)
    arc_samples = []
    deadline = time.time() + args.duration
    rounds = max(1, int(math.ceil(float(args.duration) / float(args.interval))))
    proc_accum = {}
    # Main sampling loop: each round blocks on zpool iostat for roughly
    # one interval, then diffs /proc and arcstats against the previous
    # snapshot.
    for n in range(rounds):
        remaining = deadline - time.time()
        if remaining <= 0:
            break
        # Shrink the final interval so we do not overshoot the deadline.
        interval = min(args.interval, max(1, int(round(remaining))))
        oprint("[{0}/{1}] próbka, interwał {2}s...".format(n + 1, rounds, interval))
        try:
            rows = parse_zpool_iostat_once(pools, interval)
        except Exception as ex:
            eprint("Błąd zpool iostat: {0}".format(ex))
            break
        for row in rows:
            samples_by_pool[row["name"]].append(row)
        proc_cur = read_proc_snapshot()
        diff_proc_snapshots(proc_prev, proc_cur, proc_accum)
        proc_prev = proc_cur
        arc_cur = parse_arcstats()
        delta = arc_delta(arc_prev, arc_cur)
        if delta is not None:
            arc_samples.append(delta)
        arc_prev = arc_cur
        oprint("[{0}/{1}] gotowe".format(n + 1, rounds))
    if bpf_handle:
        oprint("Kończę bpftrace...")
        stop_bpftrace(bpf_handle)
    # Persist the raw data before printing summaries.
    with open(os.path.join(outdir, "zpool_status_end.txt"), "w") as f:
        f.write(zpool_status_text(pools))
    with open(os.path.join(outdir, "samples_zpool.json"), "w") as f:
        json.dump(samples_by_pool, f, indent=2)
    with open(os.path.join(outdir, "samples_arc.json"), "w") as f:
        json.dump(arc_samples, f, indent=2)
    with open(os.path.join(outdir, "proc_totals.json"), "w") as f:
        json.dump(proc_accum, f, indent=2)
    sample_summary = summarize_samples(samples_by_pool)
    bpf = parse_bpftrace_output(bpf_handle["out_path"] if bpf_handle else "")
    proc_rows = list(proc_accum.values())
    # Build human-readable top-N rows for process read/write totals.
    top_proc_read = []
    for e in top_entries(proc_rows, "read_bytes", args.top):
        top_proc_read.append({
            "proc": format_proc_entry(e),
            "read": human_bytes(e["read_bytes"]),
            "write": human_bytes(e["write_bytes"]),
            "max_read_interval": human_bytes(e["max_interval_read_bytes"]),
            "syscr": e["syscr"],
        })
    top_proc_write = []
    for e in top_entries(proc_rows, "write_bytes", args.top):
        top_proc_write.append({
            "proc": format_proc_entry(e),
            "write": human_bytes(e["write_bytes"]),
            "read": human_bytes(e["read_bytes"]),
            "max_write_interval": human_bytes(e["max_interval_write_bytes"]),
            "syscw": e["syscw"],
        })
    oprint()
    oprint("Wyniki zapisane w: {0}".format(outdir))
    oprint("Pule: {0}".format(", ".join(pools)))
    oprint("Czas zbierania: {0}s, interwał: {1}s, próbek: {2}".format(
        args.duration, args.interval, sum(len(v) for v in samples_by_pool.values())
    ))
    oprint()
    pool_rows = []
    for pool in pools:
        s = sample_summary.get(pool, {})
        pool_rows.append({
            "pool": pool,
            "avg_read/s": human_bytes(s.get("read_avg", 0)),
            "avg_write/s": human_bytes(s.get("write_avg", 0)),
            "p95_read/s": human_bytes(s.get("read_p95", 0)),
            "p95_write/s": human_bytes(s.get("write_p95", 0)),
            "max_read/s": human_bytes(s.get("read_max", 0)),
            "max_write/s": human_bytes(s.get("write_max", 0)),
            "avg_total_wait": human_ns_to_ms(s.get("total_wait_avg_ns", 0)),
            "avg_disk_wait": human_ns_to_ms(s.get("disk_wait_avg_ns", 0)),
        })
    print_table("Podsumowanie puli", pool_rows, [
        ("pool", "pool"),
        ("avg_read/s", "avg_read/s"),
        ("avg_write/s", "avg_write/s"),
        ("p95_read/s", "p95_read/s"),
        ("p95_write/s", "p95_write/s"),
        ("avg_total_wait", "avg_total_wait"),
        ("avg_disk_wait", "avg_disk_wait"),
    ])
    print_table("Top procesy wg read_bytes z /proc/<pid>/io", top_proc_read, [
        ("proc", "proc"),
        ("read", "read"),
        ("write", "write"),
        ("max_read_interval", "max_read_interval"),
        ("syscr", "syscr"),
    ])
    print_table("Top procesy wg write_bytes z /proc/<pid>/io", top_proc_write, [
        ("proc", "proc"),
        ("write", "write"),
        ("read", "read"),
        ("max_write_interval", "max_write_interval"),
        ("syscw", "syscw"),
    ])
    # ARC/L2ARC hit-rate summary over the collected deltas.
    if arc_samples:
        hits = sum(x.get("hits", 0) for x in arc_samples)
        misses = sum(x.get("misses", 0) for x in arc_samples)
        total = hits + misses
        hitp = (100.0 * hits / total) if total else 0.0
        l2_hits = sum(x.get("l2_hits", 0) for x in arc_samples)
        l2_misses = sum(x.get("l2_misses", 0) for x in arc_samples)
        l2_total = l2_hits + l2_misses
        l2p = (100.0 * l2_hits / l2_total) if l2_total else 0.0
        oprint("ARC/L2ARC")
        oprint(" ARC hit rate: {0:.2f}% (hits={1}, misses={2})".format(hitp, hits, misses))
        oprint(" L2ARC hit rate: {0:.2f}% (hits={1}, misses={2})".format(l2p, l2_hits, l2_misses))
        oprint()
    # bpftrace-derived top tables (only sections that produced data).
    if bpf:
        rb = bpf.get("READ_BYTES_BY_COMM", {})
        if rb:
            rows = []
            for comm, value in sorted(rb.items(), key=lambda kv: kv[1], reverse=True)[:args.top]:
                rows.append({"comm": comm, "read_bytes": human_bytes(value)})
            print_table("Top comm wg read() bajtów z bpftrace", rows, [
                ("comm", "comm"),
                ("read_bytes", "read_bytes"),
            ])
        bb = bpf.get("BLOCK_BYTES_BY_COMM", {})
        if bb:
            rows = []
            for comm, value in sorted(bb.items(), key=lambda kv: kv[1], reverse=True)[:args.top]:
                rows.append({"comm": comm, "block_bytes": human_bytes(value)})
            print_table("Top comm wg block_rq_issue bajtów z bpftrace", rows, [
                ("comm", "comm"),
                ("block_bytes", "block_bytes"),
            ])
        opens = bpf.get("OPENS", {})
        if opens:
            rows = []
            for path, value in sorted(opens.items(), key=lambda kv: kv[1], reverse=True)[:args.top]:
                rows.append({"path": path, "opens": value})
            print_table("Top otwierane pliki z bpftrace", rows, [
                ("path", "path"),
                ("opens", "opens"),
            ])
    oprint("Najważniejsze pliki wynikowe:")
    for name in [
        "zpool_status_start.txt",
        "zpool_status_end.txt",
        "zpool_history.txt",
        "zfs_properties.json",
        "samples_zpool.json",
        "samples_arc.json",
        "proc_totals.json",
        "bpftrace.txt",
    ]:
        path = os.path.join(outdir, name)
        if os.path.exists(path):
            oprint(" {0}".format(path))


if __name__ == "__main__":
    main()