#!/usr/bin/env python3
"""
Nagios plugin for comprehensive DRBD/LINSTOR monitoring
Author: Custom Nagios DRBD Check
License: GPL v3
Version: 2.0 - Enhanced with full Linstor monitoring
"""

import argparse
import sys
import subprocess
import json
import re
from typing import Dict, List, Tuple

# Nagios exit codes
STATE_OK = 0
STATE_WARNING = 1
STATE_CRITICAL = 2
STATE_UNKNOWN = 3

# Maximum time (seconds) an external command may run before it is killed.
COMMAND_TIMEOUT_SECONDS = 30


class DRBDMonitor:
    """Collect DRBD/LINSTOR state and turn it into a Nagios result.

    Each ``check_*`` method appends human-readable messages to
    ``criticals``, ``warnings`` or ``ok_messages`` (and optionally to
    ``perfdata``); ``get_final_status`` folds them into one output line
    and a Nagios exit code.
    """

    def __init__(self):
        self.resources = {}
        self.perfdata = []
        self.warnings = []
        self.criticals = []
        self.ok_messages = []

    def execute_command(self, cmd: List[str]) -> Tuple[int, str, str]:
        """Execute a system command and return (returncode, stdout, stderr).

        Never raises: a timeout is reported as return code 124 and any
        other failure to run the command (e.g. binary not found) as
        return code 1 with the exception text in stderr.
        """
        try:
            # subprocess.run() kills and reaps the child when the timeout
            # expires, so a hung drbdsetup/linstor call does not linger.
            completed = subprocess.run(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                universal_newlines=True,
                timeout=COMMAND_TIMEOUT_SECONDS,
            )
        except subprocess.TimeoutExpired:
            return 124, "", "Command timeout"
        except Exception as e:
            return 1, "", str(e)
        return completed.returncode, completed.stdout, completed.stderr

    def parse_linstor_json(self, stdout: str) -> List:
        """Parse Linstor machine-readable JSON output.

        Handles the ``[[...]]`` structure by returning the inner list
        when it is non-empty; a plain non-empty list is returned as is.
        Anything else (invalid JSON, non-list, empty list) yields ``[]``.
        """
        try:
            data = json.loads(stdout)
        except (ValueError, TypeError):
            # ValueError covers json.JSONDecodeError; TypeError covers a
            # non-string input such as None.
            return []

        if isinstance(data, list) and data:
            if isinstance(data[0], list) and data[0]:
                return data[0]  # Return inner list
            return data
        return []

    def parse_events2_output(self, output: str) -> Dict:
        """Parse ``drbdsetup events2 --now --statistics`` output.

        Every line is expected to look like
        ``<event> <object-type> key:value key:value ...``. The result has
        four dicts (``resources``, ``connections``, ``devices``,
        ``peer_devices``) mapping a unique key to the line's properties:

        * resources:    ``name``
        * connections:  ``name:peer-node-id``
        * devices:      ``name:volume``
        * peer_devices: ``name:peer-node-id:volume``

        The peer node id is part of the key so that a resource with
        several peers keeps one entry per peer instead of the last line
        overwriting the others.
        """
        data = {
            'resources': {},
            'connections': {},
            'devices': {},
            'peer_devices': {}
        }

        for line in output.strip().split('\n'):
            if not line or line.startswith('#'):
                continue

            parts = line.split()
            if len(parts) < 2:
                continue

            # parts[0] is the event type (exists, create, change, destroy)
            object_type = parts[1]  # resource, connection, device, peer-device

            # Parse key:value pairs; split only on the first ':' so values
            # that themselves contain ':' stay intact.
            props = {}
            for part in parts[2:]:
                if ':' in part:
                    key, value = part.split(':', 1)
                    props[key] = value

            name = props.get('name', 'unknown')

            # Store data by object type
            if object_type == 'resource':
                data['resources'][name] = props
            elif object_type == 'connection':
                peer_node_id = props.get('peer-node-id', 'unknown')
                data['connections'][f"{name}:{peer_node_id}"] = props
            elif object_type == 'device':
                volume = props.get('volume', '0')
                data['devices'][f"{name}:{volume}"] = props
            elif object_type == 'peer-device':
                peer_node_id = props.get('peer-node-id', 'unknown')
                volume = props.get('volume', '0')
                data['peer_devices'][f"{name}:{peer_node_id}:{volume}"] = props

        return data

    def check_resource_status(self, resource_data: Dict, args):
        """Check resource role, suspended state and promotion capability."""
        for res_name, props in resource_data.items():
            role = props.get('role', 'Unknown')
            suspended = props.get('suspended', 'no')
            may_promote = props.get('may_promote', 'no')

            # Check role
            if args.check_role:
                if role not in ('Primary', 'Secondary'):
                    self.criticals.append(f"Resource {res_name}: Invalid role {role}")
                elif role == 'Secondary' and args.require_primary:
                    self.warnings.append(
                        f"Resource {res_name}: Role is Secondary, expected Primary")
                else:
                    self.ok_messages.append(f"Resource {res_name}: Role={role}")

            # Check suspended state
            if args.check_suspended and suspended == 'yes':
                self.criticals.append(f"Resource {res_name}: SUSPENDED")

            # Check promotion capability - ONLY if resource is Secondary
            # and explicitly required
            if (args.check_promotion and may_promote == 'no'
                    and role == 'Secondary'
                    and args.require_promotion_capability):
                self.warnings.append(f"Resource {res_name}: Cannot be promoted")

    def check_connection_status(self, connection_data: Dict, args):
        """Check connection state and congestion for every peer connection."""
        for conn_key, props in connection_data.items():
            # Messages show the resource name; the dict key may carry an
            # additional peer suffix to keep entries unique.
            conn_name = props.get('name', conn_key)
            connection = props.get('connection', 'Unknown')
            congested = props.get('congested', 'no')
            peer_node_id = props.get('peer-node-id', 'unknown')

            # Check connection state
            if args.check_connection:
                if connection in ('Connected', 'StandAlone'):
                    self.ok_messages.append(
                        f"Connection {conn_name} to peer-{peer_node_id}: {connection}")
                elif connection in ('Connecting', 'Timeout', 'BrokenPipe',
                                    'NetworkFailure', 'ProtocolError',
                                    'TearDown', 'Unconnected', 'Disconnecting'):
                    self.criticals.append(f"Connection {conn_name}: State={connection}")
                elif connection in ('WFConnection', 'WFReportParams'):
                    self.warnings.append(f"Connection {conn_name}: State={connection}")
                else:
                    self.criticals.append(
                        f"Connection {conn_name}: Unknown state {connection}")

            # Check congestion
            if (args.check_congestion and congested == 'yes'
                    and not args.ignore_transient_congestion):
                self.warnings.append(f"Connection {conn_name}: CONGESTED")

    def check_device_status(self, device_data: Dict, args):
        """Check device/volume disk state, quorum and client mode."""
        for props in device_data.values():
            dev_name = props.get('name', 'unknown')
            volume = props.get('volume', '0')
            disk = props.get('disk', 'Unknown')
            minor = props.get('minor', 'unknown')
            client = props.get('client', 'no')
            quorum = props.get('quorum', 'yes')

            # Check disk state
            if args.check_disk:
                if disk in ('UpToDate', 'Diskless'):
                    self.ok_messages.append(
                        f"Device {dev_name} vol:{volume} (minor:{minor}): Disk={disk}")
                elif disk in ('Failed', 'Detaching',
                              'Inconsistent', 'Outdated', 'DUnknown'):
                    self.criticals.append(f"Device {dev_name} vol:{volume}: Disk={disk}")
                else:
                    # Transitional (Attaching, Negotiating) or unrecognised
                    # states are reported as warnings.
                    self.warnings.append(f"Device {dev_name} vol:{volume}: Disk={disk}")

            # Check quorum - but not for diskless clients
            if args.check_quorum and quorum == 'no' and disk != 'Diskless':
                self.criticals.append(f"Device {dev_name} vol:{volume}: NO QUORUM")

            # Check client mode
            if args.check_client and client == 'yes':
                self.ok_messages.append(
                    f"Device {dev_name} vol:{volume}: Running in client mode")

    def check_peer_device_status(self, peer_device_data: Dict, args):
        """Check peer device replication state, peer disk and resync state."""
        for props in peer_device_data.values():
            peer_name = props.get('name', 'unknown')
            volume = props.get('volume', '0')
            replication = props.get('replication', 'Unknown')
            peer_disk = props.get('peer-disk', 'Unknown')
            resync_suspended = props.get('resync-suspended', 'no')
            peer_node_id = props.get('peer-node-id', 'unknown')
            prefix = f"Peer {peer_name} vol:{volume}"

            # Check replication state
            if args.check_replication:
                if replication in ('Established', 'Off'):
                    self.ok_messages.append(
                        f"Peer {peer_name} (node-{peer_node_id}) vol:{volume}: {replication}")
                elif replication in ('SyncSource', 'SyncTarget'):
                    target = self.warnings if args.warn_on_sync else self.ok_messages
                    target.append(f"{prefix}: Syncing ({replication})")
                elif replication in ('PausedSyncS', 'PausedSyncT'):
                    self.warnings.append(f"{prefix}: Sync paused ({replication})")
                elif replication in ('WFBitMapS', 'WFBitMapT', 'WFSyncUUID'):
                    self.warnings.append(f"{prefix}: Waiting for sync ({replication})")
                elif replication in ('StartingSyncS', 'StartingSyncT'):
                    self.ok_messages.append(f"{prefix}: Starting sync ({replication})")
                elif replication in ('VerifyS', 'VerifyT'):
                    self.ok_messages.append(f"{prefix}: Verifying ({replication})")
                else:
                    self.criticals.append(f"{prefix}: Replication={replication}")

            # Check peer disk state
            if args.check_peer_disk:
                if peer_disk not in ('UpToDate', 'Diskless', 'DUnknown'):
                    if peer_disk in ('Failed', 'Outdated', 'Inconsistent'):
                        self.criticals.append(f"{prefix}: Peer-Disk={peer_disk}")
                    else:
                        self.warnings.append(f"{prefix}: Peer-Disk={peer_disk}")

            # Check resync suspended
            if args.check_resync_suspended and resync_suspended == 'yes':
                self.warnings.append(f"{prefix}: Resync SUSPENDED")

    def check_linstor_nodes(self, args):
        """Check Linstor node status and communication.

        Runs ``linstor -m node list``. Failures to run or parse the
        command are only reported (as a warning) in verbose mode.
        """
        if not args.check_linstor:
            return

        cmd = ['linstor', '-m', 'node', 'list']
        rc, stdout, stderr = self.execute_command(cmd)

        if rc != 0:
            if args.verbose:
                self.warnings.append("LINSTOR: Cannot get node list")
            return

        try:
            nodes = self.parse_linstor_json(stdout)

            online_nodes = 0
            offline_nodes = []
            evicted_nodes = []

            for node in nodes:
                if not isinstance(node, dict):
                    continue

                node_name = node.get('name', 'unknown')
                connection_status = node.get('connection_status', 'UNKNOWN')
                node_type = node.get('type', 'UNKNOWN')
                # "or []" also covers an explicit JSON null.
                flags = node.get('flags') or []

                # Check for EVICTED flag
                if 'EVICTED' in flags:
                    evicted_nodes.append(f"{node_name}({node_type})")
                    continue

                if connection_status == 'ONLINE':
                    online_nodes += 1
                else:
                    offline_nodes.append(
                        f"{node_name}({node_type}):{connection_status}")

            if offline_nodes:
                self.criticals.append(
                    f"LINSTOR: Nodes offline: {', '.join(offline_nodes)}")

            if evicted_nodes:
                self.warnings.append(
                    f"LINSTOR: Nodes evicted: {', '.join(evicted_nodes)}")

            if not offline_nodes and not evicted_nodes and online_nodes > 0:
                self.ok_messages.append(f"LINSTOR: All {online_nodes} nodes online")

            if args.performance_data:
                self.perfdata.append(f"linstor_nodes_online={online_nodes}")
                self.perfdata.append(f"linstor_nodes_offline={len(offline_nodes)}")

        except Exception as e:
            if args.verbose:
                self.warnings.append(f"LINSTOR: Error checking nodes: {str(e)[:50]}")

    def check_linstor_storage_pools(self, args):
        """Check Linstor storage pool reports and capacity usage.

        Runs ``linstor -m storage-pool list``. A pool at >= 95% usage is
        an error, >= 90% a warning. DISKLESS pools are skipped for the
        capacity check but their reports are still inspected.
        """
        if not args.check_linstor:
            return

        cmd = ['linstor', '-m', 'storage-pool', 'list']
        rc, stdout, stderr = self.execute_command(cmd)

        if rc != 0:
            if args.verbose:
                self.warnings.append("LINSTOR: Cannot get storage pool list")
            return

        try:
            storage_pools = self.parse_linstor_json(stdout)

            pool_errors = []
            pool_warnings = []
            total_capacity = 0
            total_free = 0
            pools_checked = 0

            for pool in storage_pools:
                if not isinstance(pool, dict):
                    continue

                node_name = pool.get('node_name', 'unknown')
                pool_name = pool.get('stor_pool_name', 'unknown')
                provider_kind = pool.get('provider_kind', 'unknown')
                pool_id = f"{node_name}:{pool_name}"

                # Check for errors in reports ("or" guards explicit nulls)
                for report in pool.get('reports') or []:
                    if not isinstance(report, dict):
                        continue

                    message = report.get('message') or ''
                    ret_code = report.get('ret_code', 0)

                    # Check for specific error patterns
                    if 'ApiRcException' in message or 'RuntimeException' in message:
                        pool_errors.append(f"{pool_id} - ApiRcException")
                    elif 'Failed to query free space' in message:
                        pool_errors.append(f"{pool_id} - Cannot query free space")
                    elif 'query free space from storage pool' in message.lower():
                        pool_errors.append(f"{pool_id} - Storage pool query failed")
                    elif ret_code not in (0, None) and message:
                        pool_warnings.append(f"{pool_id} - {message[:60]}")

                # Skip DISKLESS pools
                if provider_kind == 'DISKLESS':
                    continue

                # Check free space
                free_capacity = pool.get('free_capacity', None)
                total_capacity_pool = pool.get('total_capacity', None)

                if free_capacity is None or total_capacity_pool is None:
                    # Cannot read capacity - potential issue
                    pool_warnings.append(f"{pool_id} - Cannot read capacity")
                    continue

                pools_checked += 1
                total_capacity += total_capacity_pool
                total_free += free_capacity

                # Calculate usage percentage
                if total_capacity_pool > 0:
                    usage_percent = (
                        (total_capacity_pool - free_capacity) / total_capacity_pool
                    ) * 100

                    if usage_percent >= 95:
                        pool_errors.append(f"{pool_id} - {usage_percent:.1f}% full")
                    elif usage_percent >= 90:
                        pool_warnings.append(f"{pool_id} - {usage_percent:.1f}% full")

            # Report errors (at most three entries to keep the line short)
            if pool_errors:
                self.criticals.append(
                    f"LINSTOR: Storage pool errors: {'; '.join(pool_errors[:3])}")

            if pool_warnings:
                self.warnings.append(
                    f"LINSTOR: Storage pool warnings: {'; '.join(pool_warnings[:3])}")

            if not pool_errors and not pool_warnings and pools_checked > 0:
                self.ok_messages.append(f"LINSTOR: {pools_checked} storage pools OK")

            # Performance data
            if total_capacity > 0 and args.performance_data:
                usage_percent = ((total_capacity - total_free) / total_capacity) * 100
                self.perfdata.append(f"linstor_storage_usage={usage_percent:.2f}%")
                # NOTE(review): the // 1024 assumes LINSTOR reports
                # capacities in KiB - confirm against the client version.
                self.perfdata.append(f"linstor_storage_free_mb={total_free // 1024}")
                self.perfdata.append(f"linstor_storage_total_mb={total_capacity // 1024}")

        except Exception as e:
            if args.verbose:
                self.warnings.append(
                    f"LINSTOR: Error checking storage pools: {str(e)[:50]}")

    def check_linstor_resources(self, args):
        """Check Linstor resource status and synchronization.

        Runs ``linstor -m resource list`` and inspects the DRBD layer
        data of every volume for sync-in-progress and error states.
        """
        if not args.check_linstor:
            return

        cmd = ['linstor', '-m', 'resource', 'list']
        rc, stdout, stderr = self.execute_command(cmd)

        if rc != 0:
            if args.verbose:
                self.warnings.append("LINSTOR: Cannot get resource list")
            return

        try:
            resources = self.parse_linstor_json(stdout)

            syncing_resources = []
            error_resources = []
            total_volumes = 0
            uptodate_count = 0

            for resource in resources:
                if not isinstance(resource, dict):
                    continue

                node_name = resource.get('node_name', 'unknown')
                resource_name = resource.get('name', 'unknown')

                # Check volumes for sync status
                for volume in resource.get('vlms') or []:
                    if not isinstance(volume, dict):
                        continue

                    total_volumes += 1
                    vol_nr = volume.get('vlm_nr', 0)
                    vol_id = f"{resource_name}@{node_name}:{vol_nr}"

                    # Check DRBD state
                    for layer in volume.get('layer_data_list') or []:
                        if not isinstance(layer, dict):
                            continue
                        if layer.get('type') != 'DRBD':
                            continue

                        drbd_data = layer.get('data') or {}
                        disk_state = drbd_data.get('disk_state', '')
                        replication_state = drbd_data.get('repl_state', '')

                        # Check for UpToDate
                        if disk_state == 'UpToDate':
                            uptodate_count += 1

                        # Check for sync in progress
                        if replication_state in ('SyncSource', 'SyncTarget',
                                                 'PausedSyncS', 'PausedSyncT'):
                            syncing_resources.append(
                                f"{vol_id} ({replication_state})")

                        # Check for error states
                        if disk_state in ('Failed', 'Inconsistent'):
                            error_resources.append(f"{vol_id} ({disk_state})")

                        if replication_state in ('StandAlone', 'Disconnecting',
                                                 'NetworkFailure'):
                            error_resources.append(
                                f"{vol_id} ({replication_state})")

            # Report sync status
            if syncing_resources:
                sync_count = len(syncing_resources)
                if sync_count <= 3:
                    self.warnings.append(
                        f"LINSTOR: Resources syncing: {', '.join(syncing_resources)}")
                else:
                    self.warnings.append(
                        f"LINSTOR: {sync_count} resources syncing: "
                        f"{', '.join(syncing_resources[:3])}...")

            if error_resources:
                self.criticals.append(
                    f"LINSTOR: Resources in error state: {', '.join(error_resources)}")

            if not syncing_resources and not error_resources and total_volumes > 0:
                self.ok_messages.append(
                    f"LINSTOR: {uptodate_count}/{total_volumes} volumes UpToDate")

            # Performance data
            if args.performance_data:
                self.perfdata.append(f"linstor_volumes_total={total_volumes}")
                self.perfdata.append(
                    f"linstor_volumes_syncing={len(syncing_resources)}")
                self.perfdata.append(
                    f"linstor_volumes_errors={len(error_resources)}")
                self.perfdata.append(f"linstor_volumes_uptodate={uptodate_count}")

        except Exception as e:
            if args.verbose:
                self.warnings.append(
                    f"LINSTOR: Error checking resources: {str(e)[:50]}")

    def check_linstor_error_reports(self, args):
        """Check the number of Linstor error reports against thresholds.

        Runs ``linstor -m error-reports list``. A count above
        ``args.linstor_error_critical`` is critical, above
        ``args.linstor_error_warning`` a warning. A failing command is
        ignored; unexpected errors are reported only in verbose mode.
        """
        if not args.check_linstor or not args.check_linstor_errors:
            return

        cmd = ['linstor', '-m', 'error-reports', 'list']
        rc, stdout, stderr = self.execute_command(cmd)

        if rc != 0:
            return

        try:
            error_reports = self.parse_linstor_json(stdout)

            if error_reports:
                report_count = len(error_reports)

                # Apply threshold
                if report_count > args.linstor_error_critical:
                    self.criticals.append(
                        f"LINSTOR: {report_count} error reports in system")
                elif report_count > args.linstor_error_warning:
                    self.warnings.append(
                        f"LINSTOR: {report_count} error reports in system")
                elif args.verbose:
                    self.ok_messages.append(
                        f"LINSTOR: {report_count} error reports (below threshold)")

                if args.performance_data:
                    self.perfdata.append(f"linstor_error_reports={report_count}")

        except Exception as e:
            # Same best-effort policy as the other LINSTOR checks: surface
            # the problem only when the operator asked for verbose output.
            if args.verbose:
                self.warnings.append(
                    f"LINSTOR: Error checking error reports: {str(e)[:50]}")

    def add_performance_data(self, device_data: Dict, peer_device_data: Dict, args):
        """Add DRBD performance data for Nagios (only if requested)."""
        if not args.performance_data:
            return

        # NOTE: this counts device entries (one per resource volume), not
        # distinct resources; the label is kept for graph compatibility.
        resource_count = len(device_data)
        self.perfdata.append(f"drbd_resources={resource_count}")

        # Count by disk state
        uptodate = sum(1 for d in device_data.values()
                       if d.get('disk') == 'UpToDate')
        self.perfdata.append(f"drbd_uptodate_devices={uptodate}")

        # Count by replication state
        established = sum(1 for p in peer_device_data.values()
                          if p.get('replication') == 'Established')
        syncing = sum(1 for p in peer_device_data.values()
                      if p.get('replication') in ('SyncSource', 'SyncTarget'))
        self.perfdata.append(f"drbd_established_replications={established}")
        self.perfdata.append(f"drbd_syncing_replications={syncing}")

    def run_checks(self, args):
        """Run all enabled checks and return the Nagios exit code.

        Exits the process directly with CRITICAL when ``drbdsetup``
        fails or reports no resources.
        """
        # Get DRBD events2 output
        resource_filter = args.resource or 'all'
        cmd = ['drbdsetup', 'events2', '--now', '--statistics', resource_filter]
        rc, stdout, stderr = self.execute_command(cmd)

        if rc != 0:
            print(f"CRITICAL - Failed to execute drbdsetup: {stderr}")
            sys.exit(STATE_CRITICAL)

        if not stdout.strip():
            print("CRITICAL - No DRBD resources found")
            sys.exit(STATE_CRITICAL)

        # Parse output
        data = self.parse_events2_output(stdout)

        # Run DRBD checks
        if data['resources']:
            self.check_resource_status(data['resources'], args)

        if data['connections']:
            self.check_connection_status(data['connections'], args)

        if data['devices']:
            self.check_device_status(data['devices'], args)

        if data['peer_devices']:
            self.check_peer_device_status(data['peer_devices'], args)

        # Add DRBD performance data
        self.add_performance_data(data['devices'], data['peer_devices'], args)

        # Run LINSTOR checks if enabled
        if args.check_linstor:
            self.check_linstor_nodes(args)
            self.check_linstor_storage_pools(args)
            self.check_linstor_resources(args)
            self.check_linstor_error_reports(args)

        # Determine final status
        return self.get_final_status(args)

    def get_final_status(self, args) -> int:
        """Print the Nagios output line and return the exit code.

        The most severe non-empty category decides the status. In
        verbose mode, less severe messages are appended after it. The
        instance's message lists are left untouched.
        """
        if self.criticals:
            status = STATE_CRITICAL
            status_text = "CRITICAL"
            # Copy so that verbose output never grows self.criticals.
            messages = list(self.criticals)
            if args.verbose:
                messages.extend(self.warnings)
                messages.extend(self.ok_messages)
        elif self.warnings:
            status = STATE_WARNING
            status_text = "WARNING"
            messages = list(self.warnings)
            if args.verbose:
                messages.extend(self.ok_messages)
        else:
            status = STATE_OK
            status_text = "OK"
            if args.verbose and self.ok_messages:
                messages = list(self.ok_messages)
            else:
                # Also used in verbose mode when no check produced an OK
                # message, so the output is never an empty "OK - ".
                messages = ["All DRBD/LINSTOR checks passed"]

        # Build output
        output = f"{status_text} - {'; '.join(messages)}"

        if self.perfdata:
            output += " | " + " ".join(self.perfdata)

        print(output)
        return status


def main():
    """Parse command-line options, run the checks and exit with the status."""
    parser = argparse.ArgumentParser(
        description='Comprehensive DRBD/LINSTOR Nagios monitoring plugin',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Check all DRBD and LINSTOR parameters
  %(prog)s --all --check-linstor

  # Check specific resource
  %(prog)s --resource r0 --all

  # Check only connection and replication status
  %(prog)s --check-connection --check-replication

  # Full check with performance data and verbose output
  %(prog)s --all --check-linstor --check-linstor-errors --performance-data --verbose

  # Ignore transient issues
  %(prog)s --all --check-linstor --ignore-transient-congestion
"""
    )

    # Resource selection
    parser.add_argument('-r', '--resource',
                        help='DRBD resource name to check (default: all)')

    # Check options - DRBD
    parser.add_argument('--all', action='store_true',
                        help='Enable all DRBD checks (recommended)')
    parser.add_argument('--check-role', action='store_true',
                        help='Check resource role (Primary/Secondary)')
    parser.add_argument('--check-disk', action='store_true',
                        help='Check disk state (UpToDate/Inconsistent/etc)')
    parser.add_argument('--check-connection', action='store_true',
                        help='Check connection state between nodes')
    parser.add_argument('--check-replication', action='store_true',
                        help='Check replication state (Established/SyncSource/etc)')
    parser.add_argument('--check-peer-disk', action='store_true',
                        help='Check peer disk state')
    parser.add_argument('--check-suspended', action='store_true',
                        help='Check if resource is suspended')
    parser.add_argument('--check-promotion', action='store_true',
                        help='Check if resource may be promoted')
    parser.add_argument('--require-promotion-capability', action='store_true',
                        help='Warn if Secondary resources cannot be promoted')
    parser.add_argument('--check-quorum', action='store_true',
                        help='Check quorum status')
    parser.add_argument('--check-congestion', action='store_true',
                        help='Check network congestion')
    parser.add_argument('--ignore-transient-congestion', action='store_true',
                        help='Ignore transient congestion warnings')
    parser.add_argument('--check-client', action='store_true',
                        help='Check if running in client mode')
    parser.add_argument('--check-resync-suspended', action='store_true',
                        help='Check if resync is suspended')

    # LINSTOR options
    parser.add_argument('--check-linstor', action='store_true',
                        help='Check LINSTOR status (nodes, storage pools, resources)')
    parser.add_argument('--check-linstor-errors', action='store_true',
                        help='Check LINSTOR error reports')
    parser.add_argument('--linstor-error-warning', type=int, default=10,
                        help='Warning threshold for error reports (default: 10)')
    parser.add_argument('--linstor-error-critical', type=int, default=50,
                        help='Critical threshold for error reports (default: 50)')

    # Behavior options
    parser.add_argument('--require-primary', action='store_true',
                        help='Warn if resource is not Primary')
    parser.add_argument('--warn-on-sync', action='store_true',
                        help='Warn when synchronization is in progress (default: OK)')
    parser.add_argument('--performance-data', action='store_true',
                        help='Include performance data for graphing')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Verbose output (show all status messages)')
    parser.add_argument('--version', action='version', version='%(prog)s 2.0')

    args = parser.parse_args()

    # If --all is specified, enable all DRBD checks
    if args.all:
        args.check_role = True
        args.check_disk = True
        args.check_connection = True
        args.check_replication = True
        args.check_peer_disk = True
        args.check_suspended = True
        args.check_quorum = True
        args.check_congestion = True
        args.check_client = True
        args.check_resync_suspended = True
        args.performance_data = True

    # If no checks specified, enable basic checks
    if not any([args.check_role, args.check_disk, args.check_connection,
                args.check_replication, args.check_peer_disk,
                args.check_suspended, args.check_promotion, args.check_quorum,
                args.check_congestion, args.check_client,
                args.check_resync_suspended, args.check_linstor]):
        args.check_role = True
        args.check_disk = True
        args.check_connection = True
        args.check_replication = True

    # Run checks
    monitor = DRBDMonitor()
    try:
        status = monitor.run_checks(args)
        sys.exit(status)
    except KeyboardInterrupt:
        print("UNKNOWN - Check interrupted")
        sys.exit(STATE_UNKNOWN)
    except Exception as e:
        print(f"UNKNOWN - Unexpected error: {e}")
        sys.exit(STATE_UNKNOWN)


if __name__ == '__main__':
    main()