Add check_dockers
This commit is contained in:
346
check_dockers
Normal file
346
check_dockers
Normal file
@@ -0,0 +1,346 @@
|
||||
#!/usr/bin/env python3
|
||||
import configparser
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
# Default config file location; main() uses it unless -c/--config gives another path.
CONFIG_PATH = "/etc/docker-monitoring/containers.ini"
|
||||
|
||||
|
||||
def run(cmd: List[str]) -> Tuple[int, str, str]:
    """Run *cmd* without a shell and capture its output.

    Args:
        cmd: Program and arguments, handed to ``subprocess.run`` as a list.

    Returns:
        ``(returncode, stdout, stderr)`` with both streams stripped of
        surrounding whitespace.  When the program cannot be started at all
        (missing binary, not executable, ...) the result is
        ``(127, "", <error text>)`` instead of an exception, so callers that
        only look at the return code keep working.
    """
    try:
        proc = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except OSError as exc:
        # E.g. FileNotFoundError when the binary is not installed: report a
        # failed command rather than dying with a traceback (exit status 1).
        return 127, "", str(exc)
    return proc.returncode, proc.stdout.strip(), proc.stderr.strip()
|
||||
|
||||
|
||||
def nagios_exit(code: int, msg: str):
    """Print *msg* on stdout and terminate the process with status *code*."""
    print(msg)
    raise SystemExit(code)
|
||||
|
||||
|
||||
def ensure_dir(path: str):
    """Create the directory that would contain *path* when it is missing."""
    parent = os.path.dirname(path)
    if not parent:
        # Bare file name: there is no directory component to create.
        return
    if os.path.isdir(parent):
        return
    os.makedirs(parent, exist_ok=True)
|
||||
|
||||
|
||||
def parse_bool(v: str, default: bool = False) -> bool:
    """Interpret a config value as a boolean.

    ``None`` yields *default*.  Any other value is stringified, trimmed and
    lower-cased, and counts as true only for "1", "true", "yes" or "on".
    """
    if v is None:
        return default
    truthy = {"1", "true", "yes", "on"}
    normalized = str(v).strip().lower()
    return normalized in truthy
|
||||
|
||||
|
||||
def parse_float(v: str, default: float = 0.0) -> float:
    """Convert *v* to a float, returning *default* when conversion fails.

    The value is stringified and trimmed before conversion, so numbers and
    padded strings are accepted alike.
    """
    try:
        number = float(str(v).strip())
    except Exception:
        return default
    else:
        return number
|
||||
|
||||
|
||||
def docker_available() -> bool:
    """Return True when ``docker info`` exits with status 0."""
    status = run(["docker", "info", "--format", "{{json .}}"])[0]
    return status == 0
|
||||
|
||||
|
||||
def list_containers_all() -> List[str]:
    """Return the names printed by ``docker ps -a``, one per container.

    Exits through ``nagios_exit`` with status 3 (UNKNOWN) when the listing
    command fails.
    """
    status, listing, error_text = run(["docker", "ps", "-a", "--format", "{{.Names}}"])
    if status != 0:
        nagios_exit(3, f"UNKNOWN - cannot list containers: {error_text}")
    # Drop blank lines and surrounding whitespace.
    return [entry for entry in map(str.strip, listing.splitlines()) if entry]
|
||||
|
||||
|
||||
def inspect_container(name: str) -> Dict:
    """Return the ``docker inspect`` record for the container called *name*.

    The lookup is limited to containers (``--type container``) so that an
    image, network or volume sharing the name is never mistaken for it.

    Returns:
        The first record of the JSON array printed by docker, or ``{}`` when
        the command fails, the output is not valid JSON, or the output does
        not have the expected "list of objects" shape.
    """
    rc, out, _ = run(["docker", "inspect", "--type", "container", name])
    if rc != 0:
        return {}
    try:
        records = json.loads(out)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass.
        return {}
    if isinstance(records, list) and records and isinstance(records[0], dict):
        return records[0]
    return {}
|
||||
|
||||
|
||||
def detect_container_type(inspect: Dict) -> str:
    """Classify a container from the labels in its inspect record.

    Returns "compose" or "stack" when the matching label key is present (the
    compose label is checked first), otherwise "standalone".
    """
    config = inspect.get("Config") or {}
    labels = config.get("Labels") or {}
    label_to_type = (
        ("com.docker.compose.project", "compose"),
        ("com.docker.stack.namespace", "stack"),
    )
    for label, kind in label_to_type:
        if label in labels:
            return kind
    return "standalone"
|
||||
|
||||
|
||||
def get_ip(inspect: Dict) -> str:
    """Return the first non-empty ``IPAddress`` among the container's networks.

    Networks are read from ``NetworkSettings.Networks`` of the inspect record;
    "-" is returned when none of them carries an address.
    """
    settings = inspect.get("NetworkSettings") or {}
    networks = settings.get("Networks") or {}
    addresses = (network.get("IPAddress") for network in networks.values())
    return next((address for address in addresses if address), "-")
|
||||
|
||||
|
||||
def get_started_at(inspect: Dict) -> str:
    """Return ``State.StartedAt`` from the inspect record, or "-" if absent/empty."""
    state = inspect.get("State") or {}
    started = state.get("StartedAt")
    return started if started else "-"
|
||||
|
||||
|
||||
def get_restart_count(inspect: Dict) -> int:
    """Return the record's ``RestartCount`` as an int, or 0 if it is unusable."""
    try:
        raw_count = inspect.get("RestartCount", 0)
        count = int(raw_count)
    except Exception:
        # Missing, None or non-numeric values all count as "no restarts".
        count = 0
    return count
|
||||
|
||||
|
||||
def get_running(inspect: Dict) -> bool:
    """Return True when ``State.Running`` in the inspect record is truthy."""
    state = inspect.get("State") or {}
    return True if state.get("Running") else False
|
||||
|
||||
|
||||
def get_status_text(inspect: Dict) -> str:
    """Return ``State.Status`` from the inspect record, or "unknown" if absent/empty."""
    state = inspect.get("State") or {}
    status = state.get("Status")
    return status if status else "unknown"
|
||||
|
||||
|
||||
def create_default_config(path: str):
    """Write a fresh config file at *path* covering every listed container.

    The file holds a ``[global]`` section with default thresholds followed by
    one section per name returned by ``list_containers_all()``.  Every
    container section is enabled, records the detected type, and leaves its
    own thresholds empty.  The file is opened in "w" mode, so existing content
    at *path* is replaced.
    """
    ensure_dir(path)

    parser = configparser.ConfigParser()
    parser["global"] = dict(
        monitor_resources="true",
        default_cpu_warn="80",
        default_cpu_crit="95",
        default_mem_warn="80",
        default_mem_crit="95",
        default_restart_warn="5",
        default_restart_crit="20",
        skip_types="compose,stack",
    )

    threshold_keys = (
        "cpu_warn",
        "cpu_crit",
        "mem_warn",
        "mem_crit",
        "restart_warn",
        "restart_crit",
    )

    for container in list_containers_all():
        details = inspect_container(container)
        kind = detect_container_type(details)
        labels = (details.get("Config") or {}).get("Labels") or {}

        entry = {
            "enabled": "true",
            "type": kind,
            "monitor_liveness": "true",
            "monitor_resources": "true",
        }
        # Empty per-container thresholds: evaluate_container falls back to the
        # global default_* value whenever one of these is empty.
        entry.update(dict.fromkeys(threshold_keys, ""))

        if kind == "compose":
            entry["note"] = f"auto-detected compose project: {labels.get('com.docker.compose.project', '-')}"
        elif kind == "stack":
            entry["note"] = f"auto-detected stack: {labels.get('com.docker.stack.namespace', '-')}"

        parser[container] = entry

    with open(path, "w") as handle:
        parser.write(handle)
|
||||
|
||||
|
||||
def load_or_create_config(path: str) -> configparser.ConfigParser:
    """Load the monitoring config from *path*, generating it first if absent.

    Args:
        path: Location of the INI file.

    Returns:
        The parsed configuration.  A ``[global]`` section with the built-in
        defaults is injected when the file does not define one.

    Exits through ``nagios_exit`` with status 3 (UNKNOWN) when the file cannot
    be created, cannot be read, or is not valid INI.  Without this an uncaught
    exception would end the process with status 1, which Nagios reads as
    WARNING.
    """
    if not os.path.exists(path):
        try:
            create_default_config(path)
        except OSError as exc:
            nagios_exit(3, f"UNKNOWN - cannot create config {path}: {exc}")

    cfg = configparser.ConfigParser()
    try:
        # read() silently skips files it cannot open and returns the list of
        # files it actually parsed.
        loaded = cfg.read(path)
    except (configparser.Error, UnicodeDecodeError) as exc:
        nagios_exit(3, f"UNKNOWN - cannot parse config {path}: {exc}")
    if not loaded:
        nagios_exit(3, f"UNKNOWN - cannot read config {path}")

    if "global" not in cfg:
        cfg["global"] = {
            "monitor_resources": "true",
            "default_cpu_warn": "80",
            "default_cpu_crit": "95",
            "default_mem_warn": "80",
            "default_mem_crit": "95",
            "default_restart_warn": "5",
            "default_restart_crit": "20",
            "skip_types": "compose,stack",
        }

    return cfg
|
||||
|
||||
|
||||
def get_stats_one_shot() -> Dict[str, Dict[str, str]]:
    """Take a single ``docker stats --no-stream`` sample.

    Returns:
        A mapping of container name to ``{"cpu", "mem_perc", "mem_usage"}``
        strings, with the "%" sign removed from the two percentages.  The
        mapping is empty when the command fails; lines that do not split into
        four "|"-separated fields are skipped.
    """
    status, output, _ = run([
        "docker", "stats", "--no-stream",
        "--format", "{{.Name}}|{{.CPUPerc}}|{{.MemPerc}}|{{.MemUsage}}",
    ])
    if status != 0:
        return {}

    samples = {}
    for row in output.splitlines():
        fields = [field.strip() for field in row.split("|", 3)]
        if len(fields) == 4:
            container, cpu_pct, mem_pct, mem_usage = fields
            samples[container] = {
                "cpu": cpu_pct.replace("%", ""),
                "mem_perc": mem_pct.replace("%", ""),
                "mem_usage": mem_usage,
            }
    return samples
|
||||
|
||||
|
||||
def state_max(a: int, b: int) -> int:
    """Return the larger of two state codes."""
    return max(a, b)
|
||||
|
||||
|
||||
def evaluate_container(
    name: str,
    section: configparser.SectionProxy,
    global_cfg: configparser.SectionProxy,
    stats: Dict[str, Dict[str, str]],
) -> Tuple[int, str]:
    """Evaluate one container against its configured checks.

    Args:
        name: Container name (also the config section name).
        section: The container's own config section.
        global_cfg: The ``[global]`` section, supplying ``default_*``
            thresholds for any per-container threshold left empty.
        stats: Output of ``get_stats_one_shot()``; resource checks are skipped
            for containers that have no entry in it.

    Returns:
        ``(code, text)`` where code is 0 (OK), 1 (WARNING) or 2 (CRITICAL).
        For a non-zero code the text joins every collected message with
        " ; ".
    """
    info = inspect_container(name)
    if not info:
        return 2, f"CRITICAL - {name} container not found"

    running = get_running(info)
    status_txt = get_status_text(info)
    ip = get_ip(info)
    started = get_started_at(info)
    restarts = get_restart_count(info)

    def threshold(key, default):
        # Per-container value wins; an empty/missing one falls back to the
        # global "default_<key>", then to the built-in default.
        raw = section.get(key) or global_cfg.get(f"default_{key}", str(default))
        return parse_float(raw, default)

    code = 0
    messages = []

    monitor_liveness = parse_bool(section.get("monitor_liveness", "true"), True)
    monitor_resources = parse_bool(
        section.get("monitor_resources", global_cfg.get("monitor_resources", "true")),
        True,
    )

    if monitor_liveness:
        if running:
            messages.append(f"OK - {name} is running. IP: {ip}, StartedAt: {started}")
        else:
            code = 2
            messages.append(f"CRITICAL - {name} is not running (status: {status_txt})")

    restart_warn = threshold("restart_warn", 5)
    restart_crit = threshold("restart_crit", 20)

    if restarts >= restart_crit:
        code = state_max(code, 2)
        messages.append(f"CRITICAL - {name} restart count {restarts} >= {restart_crit}")
    elif restarts >= restart_warn:
        code = state_max(code, 1)
        messages.append(f"WARNING - {name} restart count {restarts} >= {restart_warn}")

    if monitor_resources and name in stats:
        sample = stats[name]
        cpu = parse_float(sample.get("cpu", "0"), 0)
        memp = parse_float(sample.get("mem_perc", "0"), 0)
        memu = sample.get("mem_usage", "-")

        cpu_warn = threshold("cpu_warn", 80)
        cpu_crit = threshold("cpu_crit", 95)
        mem_warn = threshold("mem_warn", 80)
        mem_crit = threshold("mem_crit", 95)

        if cpu >= cpu_crit:
            code = state_max(code, 2)
            messages.append(f"CRITICAL - {name} CPU {cpu:.1f}% >= {cpu_crit}%")
        elif cpu >= cpu_warn:
            code = state_max(code, 1)
            messages.append(f"WARNING - {name} CPU {cpu:.1f}% >= {cpu_warn}%")

        if memp >= mem_crit:
            code = state_max(code, 2)
            messages.append(f"CRITICAL - {name} MEM {memp:.1f}% >= {mem_crit}% ({memu})")
        elif memp >= mem_warn:
            code = state_max(code, 1)
            messages.append(f"WARNING - {name} MEM {memp:.1f}% >= {mem_warn}% ({memu})")

    if code == 0:
        if running:
            return 0, f"OK - {name} is running. IP: {ip}, StartedAt: {started}"
        # Only reachable with liveness monitoring disabled: report the real
        # state instead of claiming the container is running.
        return 0, f"OK - {name} liveness not monitored (status: {status_txt})"

    return code, " ; ".join(messages)
|
||||
|
||||
|
||||
def main():
    """Parse the command line, evaluate every enabled container and report.

    Recognised arguments: ``-c/--config PATH``, ``--init/--init-only`` and
    ``-v/--verbose``.  Anything else (including a trailing ``-c`` with no
    path) is ignored.

    With ``--init`` the default config is written to the selected path and the
    process exits 0.  Otherwise the process exits with the worst per-container
    state (0 OK, 1 WARNING, 2 CRITICAL), or 3 (UNKNOWN) when docker is not
    available or no section is enabled.
    """
    config_path = CONFIG_PATH
    verbose = False
    init_requested = False

    i = 1
    while i < len(sys.argv):
        arg = sys.argv[i]
        if arg in ("-c", "--config") and i + 1 < len(sys.argv):
            config_path = sys.argv[i + 1]
            i += 2
            continue
        elif arg in ("--init", "--init-only"):
            # Acted on after the loop so that "--init -c PATH" honours PATH
            # regardless of argument order.
            init_requested = True
        elif arg in ("-v", "--verbose"):
            verbose = True
        i += 1

    if init_requested:
        create_default_config(config_path)
        print(f"Created {config_path}")
        sys.exit(0)

    if not docker_available():
        nagios_exit(3, "UNKNOWN - docker is not available or permission denied")

    cfg = load_or_create_config(config_path)
    global_cfg = cfg["global"]
    stats = get_stats_one_shot()

    enabled = [
        sec
        for sec in cfg.sections()
        if sec != "global" and parse_bool(cfg[sec].get("enabled", "false"), False)
    ]

    if not enabled:
        nagios_exit(3, f"UNKNOWN - no enabled containers in {config_path}")

    overall = 0
    ok_msgs = []
    warn_msgs = []
    crit_msgs = []

    for name in enabled:
        code, txt = evaluate_container(name, cfg[name], global_cfg, stats)
        overall = state_max(overall, code)

        if code == 0:
            ok_msgs.append(txt)
        elif code == 1:
            warn_msgs.append(txt)
        elif code == 2:
            crit_msgs.append(txt)

    ok_count = len(ok_msgs)
    warn_count = len(warn_msgs)
    crit_count = len(crit_msgs)
    total = len(enabled)

    perfdata = f"containers_ok={ok_count} containers_warning={warn_count} containers_critical={crit_count}"

    if overall == 0:
        print(f"OK - checked {total} container(s), all running | {perfdata}")
        if verbose:
            print("\n".join(ok_msgs))
        sys.exit(0)

    prefix = "WARNING" if overall == 1 else "CRITICAL"
    print(f"{prefix} - checked {total} container(s): {crit_count} critical, {warn_count} warning, {ok_count} ok | {perfdata}")

    # Worst states first; OK lines only on request.
    details = crit_msgs + warn_msgs
    if verbose:
        details += ok_msgs

    if details:
        print("\n".join(details))

    sys.exit(overall)
|
||||
|
||||
|
||||
# Run the check only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user