Update check_drdb_linstor.py

This commit is contained in:
gru
2026-02-02 12:47:33 +01:00
parent e2c21d9afc
commit 65ebe57d4f

View File

@@ -1,8 +1,9 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
Nagios plugin for comprehensive DRBD/LINSTOR monitoring Nagios plugin for comprehensive DRBD/LINSTOR monitoring
Author: @linuxiarz.pl Mateusz Gruszczyński Author: Custom Nagios DRBD Check
License: GPL v3 License: GPL v3
Version: 2.0 - Enhanced with full Linstor monitoring
""" """
import argparse import argparse
@@ -42,6 +43,20 @@ class DRBDMonitor:
except Exception as e: except Exception as e:
return 1, "", str(e) return 1, "", str(e)
def parse_linstor_json(self, stdout: str) -> List:
    """Parse machine-readable LINSTOR output into a flat list of entries.

    The payload is expected to be a JSON list.  When its first element is
    itself a list (the ``[[...]]`` wrapping this plugin expects from
    ``linstor -m``), that inner list is returned; otherwise the outer list
    is returned unchanged.

    Args:
        stdout: Raw text captured from a ``linstor -m ...`` command.

    Returns:
        The list of entries, or ``[]`` when the text is not valid JSON, is
        not a string, is not a JSON list, or is an empty list.
    """
    try:
        data = json.loads(stdout)
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError (malformed text);
        # TypeError covers non-string input such as None.  Anything else
        # (KeyboardInterrupt, SystemExit, ...) must not be swallowed here.
        return []

    if not isinstance(data, list) or not data:
        return []

    first = data[0]
    if isinstance(first, list):
        # Always unwrap the inner list, even when it is empty: returning
        # the wrapper ``[[]]`` would make callers that use len() count one
        # phantom entry.
        return first
    return data
def parse_events2_output(self, output: str) -> Dict: def parse_events2_output(self, output: str) -> Dict:
"""Parse drbdsetup events2 --now --statistics output""" """Parse drbdsetup events2 --now --statistics output"""
data = { data = {
@@ -112,9 +127,7 @@ class DRBDMonitor:
self.criticals.append(f"Resource {res_name}: SUSPENDED") self.criticals.append(f"Resource {res_name}: SUSPENDED")
# Check promotion capability - ONLY if resource is Secondary and explicitly required # Check promotion capability - ONLY if resource is Secondary and explicitly required
# For diskless clients this is normal behavior
if args.check_promotion and may_promote == 'no' and role == 'Secondary': if args.check_promotion and may_promote == 'no' and role == 'Secondary':
# Don't warn for diskless resources (clients), only if explicitly checking
if args.require_promotion_capability: if args.require_promotion_capability:
self.warnings.append(f"Resource {res_name}: Cannot be promoted") self.warnings.append(f"Resource {res_name}: Cannot be promoted")
@@ -139,9 +152,8 @@ class DRBDMonitor:
else: else:
self.ok_messages.append(f"Connection {conn_name} to peer-{peer_node_id}: {connection}") self.ok_messages.append(f"Connection {conn_name} to peer-{peer_node_id}: {connection}")
# Check congestion - only warn if it's persistent or critical # Check congestion
if args.check_congestion and congested == 'yes': if args.check_congestion and congested == 'yes':
# Congestion can be temporary, so only warn instead of critical
if not args.ignore_transient_congestion: if not args.ignore_transient_congestion:
self.warnings.append(f"Connection {conn_name}: CONGESTED") self.warnings.append(f"Connection {conn_name}: CONGESTED")
@@ -192,7 +204,6 @@ class DRBDMonitor:
if args.check_replication: if args.check_replication:
if replication not in ['Established', 'Off']: if replication not in ['Established', 'Off']:
if replication in ['SyncSource', 'SyncTarget']: if replication in ['SyncSource', 'SyncTarget']:
# Synchronization in progress - warning or OK depending on config
if args.warn_on_sync: if args.warn_on_sync:
self.warnings.append(f"Peer {peer_name} vol:{volume}: Syncing ({replication})") self.warnings.append(f"Peer {peer_name} vol:{volume}: Syncing ({replication})")
else: else:
@@ -222,127 +233,297 @@ class DRBDMonitor:
if args.check_resync_suspended and resync_suspended == 'yes': if args.check_resync_suspended and resync_suspended == 'yes':
self.warnings.append(f"Peer {peer_name} vol:{volume}: Resync SUSPENDED") self.warnings.append(f"Peer {peer_name} vol:{volume}: Resync SUSPENDED")
def get_statistics(self, resource: str = 'all') -> Dict: def check_linstor_nodes(self, args):
"""Get DRBD statistics from drbdsetup events2 --statistics""" """Check Linstor node status and communication"""
cmd = ['drbdsetup', 'events2', '--now', '--statistics', resource] if not args.check_linstor:
return
cmd = ['linstor', '-m', 'node', 'list']
rc, stdout, stderr = self.execute_command(cmd) rc, stdout, stderr = self.execute_command(cmd)
if rc != 0: if rc != 0:
return {} if args.verbose:
self.warnings.append(f"LINSTOR: Cannot get node list")
return
stats = {} try:
for line in stdout.strip().split('\n'): nodes = self.parse_linstor_json(stdout)
parts = line.split()
if len(parts) < 2: online_nodes = 0
offline_nodes = []
evicted_nodes = []
for node in nodes:
if not isinstance(node, dict):
continue continue
# Look for statistics in the output node_name = node.get('name', 'unknown')
for part in parts[2:]: connection_status = node.get('connection_status', 'UNKNOWN')
if ':' in part: node_type = node.get('type', 'UNKNOWN')
key, value = part.split(':', 1) flags = node.get('flags', [])
# Check for EVICTED flag
if 'EVICTED' in flags:
evicted_nodes.append(f"{node_name}({node_type})")
continue
if connection_status == 'ONLINE':
online_nodes += 1
else:
offline_nodes.append(f"{node_name}({node_type}):{connection_status}")
if offline_nodes:
self.criticals.append(f"LINSTOR: Nodes offline: {', '.join(offline_nodes)}")
if evicted_nodes:
self.warnings.append(f"LINSTOR: Nodes evicted: {', '.join(evicted_nodes)}")
if not offline_nodes and not evicted_nodes and online_nodes > 0:
self.ok_messages.append(f"LINSTOR: All {online_nodes} nodes online")
if args.performance_data:
self.perfdata.append(f"linstor_nodes_online={online_nodes}")
self.perfdata.append(f"linstor_nodes_offline={len(offline_nodes)}")
except Exception as e:
if args.verbose:
self.warnings.append(f"LINSTOR: Error checking nodes: {str(e)[:50]}")
def check_linstor_storage_pools(self, args):
"""Check Linstor storage pool status and errors"""
if not args.check_linstor:
return
cmd = ['linstor', '-m', 'storage-pool', 'list']
rc, stdout, stderr = self.execute_command(cmd)
if rc != 0:
if args.verbose:
self.warnings.append(f"LINSTOR: Cannot get storage pool list")
return
try: try:
stats[key] = int(value) storage_pools = self.parse_linstor_json(stdout)
except ValueError:
stats[key] = value
return stats pool_errors = []
pool_warnings = []
total_capacity = 0
total_free = 0
pools_checked = 0
def add_performance_data(self, device_data: Dict, peer_device_data: Dict): for pool in storage_pools:
if not isinstance(pool, dict):
continue
node_name = pool.get('node_name', 'unknown')
pool_name = pool.get('stor_pool_name', 'unknown')
provider_kind = pool.get('provider_kind', 'unknown')
# Check for errors in reports
reports = pool.get('reports', [])
for report in reports:
if not isinstance(report, dict):
continue
message = report.get('message', '')
ret_code = report.get('ret_code', 0)
# Check for specific error patterns
if 'ApiRcException' in message or 'RuntimeException' in message:
pool_errors.append(f"{node_name}:{pool_name} - ApiRcException")
elif 'Failed to query free space' in message:
pool_errors.append(f"{node_name}:{pool_name} - Cannot query free space")
elif 'query free space from storage pool' in message.lower():
pool_errors.append(f"{node_name}:{pool_name} - Storage pool query failed")
elif ret_code not in [0, None] and message:
pool_warnings.append(f"{node_name}:{pool_name} - {message[:60]}")
# Skip DISKLESS pools
if provider_kind == 'DISKLESS':
continue
# Check free space
free_capacity = pool.get('free_capacity', None)
total_capacity_pool = pool.get('total_capacity', None)
if free_capacity is not None and total_capacity_pool is not None:
pools_checked += 1
total_capacity += total_capacity_pool
total_free += free_capacity
# Calculate usage percentage
if total_capacity_pool > 0:
usage_percent = ((total_capacity_pool - free_capacity) / total_capacity_pool) * 100
if usage_percent >= 95:
pool_errors.append(f"{node_name}:{pool_name} - {usage_percent:.1f}% full")
elif usage_percent >= 90:
pool_warnings.append(f"{node_name}:{pool_name} - {usage_percent:.1f}% full")
else:
# Cannot read capacity - potential issue
if provider_kind not in ['DISKLESS']:
pool_warnings.append(f"{node_name}:{pool_name} - Cannot read capacity")
# Report errors
if pool_errors:
self.criticals.append(f"LINSTOR: Storage pool errors: {'; '.join(pool_errors[:3])}")
if pool_warnings:
self.warnings.append(f"LINSTOR: Storage pool warnings: {'; '.join(pool_warnings[:3])}")
if not pool_errors and not pool_warnings and pools_checked > 0:
self.ok_messages.append(f"LINSTOR: {pools_checked} storage pools OK")
# Performance data
if total_capacity > 0 and args.performance_data:
usage_percent = ((total_capacity - total_free) / total_capacity) * 100
self.perfdata.append(f"linstor_storage_usage={usage_percent:.2f}%")
self.perfdata.append(f"linstor_storage_free_mb={total_free // 1024}")
self.perfdata.append(f"linstor_storage_total_mb={total_capacity // 1024}")
except Exception as e:
if args.verbose:
self.warnings.append(f"LINSTOR: Error checking storage pools: {str(e)[:50]}")
def check_linstor_resources(self, args):
    """Check Linstor resource status and synchronization.

    Runs ``linstor -m resource list``, walks every volume of every resource
    and inspects the ``DRBD`` entries of its ``layer_data_list``:

    * ``repl_state`` in a sync state  -> one warning listing the volumes
      (only the first three are named when more than three are syncing)
    * ``disk_state`` Failed/Inconsistent, or ``repl_state``
      StandAlone/Disconnecting/NetworkFailure -> one critical
    * nothing syncing and nothing failed -> an OK message with the
      UpToDate/total volume counts

    Args:
        args: Parsed command-line namespace; reads ``check_linstor``,
            ``verbose`` and ``performance_data``.

    Returns:
        None.  Results are appended to ``self.warnings``,
        ``self.criticals``, ``self.ok_messages`` and ``self.perfdata``.
        A failing command or an unexpected processing error is reported
        as a warning only in verbose mode.
    """
    if not args.check_linstor:
        return

    rc, stdout, _stderr = self.execute_command(['linstor', '-m', 'resource', 'list'])
    if rc != 0:
        if args.verbose:
            self.warnings.append("LINSTOR: Cannot get resource list")
        return

    sync_states = ('SyncSource', 'SyncTarget', 'PausedSyncS', 'PausedSyncT')
    failed_disk_states = ('Failed', 'Inconsistent')
    failed_repl_states = ('StandAlone', 'Disconnecting', 'NetworkFailure')

    try:
        resources = self.parse_linstor_json(stdout)

        syncing_resources = []
        error_resources = []
        total_volumes = 0
        uptodate_count = 0

        for resource in resources:
            if not isinstance(resource, dict):
                continue

            node_name = resource.get('node_name', 'unknown')
            resource_name = resource.get('name', 'unknown')

            # ``or []`` tolerates an explicit JSON null; without it a single
            # null field would abort the whole check via the handler below.
            for volume in resource.get('vlms') or []:
                if not isinstance(volume, dict):
                    continue

                total_volumes += 1
                location = f"{resource_name}@{node_name}:{volume.get('vlm_nr', 0)}"

                for layer in volume.get('layer_data_list') or []:
                    if not isinstance(layer, dict) or layer.get('type') != 'DRBD':
                        continue

                    drbd_data = layer.get('data') or {}
                    if not isinstance(drbd_data, dict):
                        continue

                    disk_state = drbd_data.get('disk_state', '')
                    replication_state = drbd_data.get('repl_state', '')

                    if disk_state == 'UpToDate':
                        uptodate_count += 1

                    if replication_state in sync_states:
                        syncing_resources.append(f"{location} ({replication_state})")

                    if disk_state in failed_disk_states:
                        error_resources.append(f"{location} ({disk_state})")

                    if replication_state in failed_repl_states:
                        error_resources.append(f"{location} ({replication_state})")

        # Report sync status; keep the plugin output short when many
        # volumes are syncing at once.
        if syncing_resources:
            sync_count = len(syncing_resources)
            if sync_count <= 3:
                self.warnings.append(
                    f"LINSTOR: Resources syncing: {', '.join(syncing_resources)}")
            else:
                self.warnings.append(
                    f"LINSTOR: {sync_count} resources syncing: "
                    f"{', '.join(syncing_resources[:3])}...")

        if error_resources:
            self.criticals.append(
                f"LINSTOR: Resources in error state: {', '.join(error_resources)}")

        if not syncing_resources and not error_resources and total_volumes > 0:
            self.ok_messages.append(
                f"LINSTOR: {uptodate_count}/{total_volumes} volumes UpToDate")

        if args.performance_data:
            self.perfdata.append(f"linstor_volumes_total={total_volumes}")
            self.perfdata.append(f"linstor_volumes_syncing={len(syncing_resources)}")
            self.perfdata.append(f"linstor_volumes_errors={len(error_resources)}")
            self.perfdata.append(f"linstor_volumes_uptodate={uptodate_count}")

    except Exception as e:
        # Best effort: a malformed payload must not crash the plugin.
        if args.verbose:
            self.warnings.append(f"LINSTOR: Error checking resources: {str(e)[:50]}")
def check_linstor_error_reports(self, args):
    """Check for Linstor error reports in the system.

    Runs ``linstor -m error-reports list`` and compares the number of
    reports against the configured thresholds.

    Args:
        args: Parsed command-line namespace; reads ``check_linstor``,
            ``check_linstor_errors``, ``linstor_error_warning``,
            ``linstor_error_critical``, ``verbose`` and
            ``performance_data``.

    Returns:
        None.  Appends to ``self.criticals`` when the count exceeds the
        critical threshold, to ``self.warnings`` when it exceeds the
        warning threshold, and (verbose only) to ``self.ok_messages`` when
        reports exist but stay below both.  The count is always added to
        ``self.perfdata`` when performance data is enabled.
    """
    if not args.check_linstor or not args.check_linstor_errors:
        return

    rc, stdout, _stderr = self.execute_command(['linstor', '-m', 'error-reports', 'list'])
    if rc != 0:
        # Same convention as the other LINSTOR checks: a failing command is
        # surfaced only in verbose mode instead of vanishing entirely.
        if args.verbose:
            self.warnings.append("LINSTOR: Cannot get error report list")
        return

    try:
        error_reports = self.parse_linstor_json(stdout)

        # Count only dict entries, like the other LINSTOR checks which skip
        # non-dict items; an empty wrapped payload must not count as one.
        report_count = sum(1 for report in error_reports if isinstance(report, dict))

        if report_count > 0:
            if report_count > args.linstor_error_critical:
                self.criticals.append(f"LINSTOR: {report_count} error reports in system")
            elif report_count > args.linstor_error_warning:
                self.warnings.append(f"LINSTOR: {report_count} error reports in system")
            elif args.verbose:
                self.ok_messages.append(
                    f"LINSTOR: {report_count} error reports (below threshold)")

        # Emit the metric even at zero so the perfdata series has no gaps.
        if args.performance_data:
            self.perfdata.append(f"linstor_error_reports={report_count}")

    except Exception as e:
        # Best effort: never crash the plugin, but do not hide the failure
        # from an operator who asked for verbose output.
        if args.verbose:
            self.warnings.append(f"LINSTOR: Error checking error reports: {str(e)[:50]}")
def add_performance_data(self, device_data: Dict, peer_device_data: Dict, args):
"""Add performance data for Nagios""" """Add performance data for Nagios"""
if not args.performance_data:
return
# Count resources, devices, connections # Count resources, devices, connections
resource_count = len(device_data) resource_count = len(device_data)
self.perfdata.append(f"resources={resource_count}") self.perfdata.append(f"drbd_resources={resource_count}")
# Count by disk state # Count by disk state
uptodate = sum(1 for d in device_data.values() if d.get('disk') == 'UpToDate') uptodate = sum(1 for d in device_data.values() if d.get('disk') == 'UpToDate')
self.perfdata.append(f"uptodate_devices={uptodate}") self.perfdata.append(f"drbd_uptodate_devices={uptodate}")
# Count by replication state # Count by replication state
established = sum(1 for p in peer_device_data.values() established = sum(1 for p in peer_device_data.values()
if p.get('replication') == 'Established') if p.get('replication') == 'Established')
syncing = sum(1 for p in peer_device_data.values() syncing = sum(1 for p in peer_device_data.values()
if p.get('replication') in ['SyncSource', 'SyncTarget']) if p.get('replication') in ['SyncSource', 'SyncTarget'])
self.perfdata.append(f"established_replications={established}") self.perfdata.append(f"drbd_established_replications={established}")
self.perfdata.append(f"syncing_replications={syncing}") self.perfdata.append(f"drbd_syncing_replications={syncing}")
def check_linstor_status(self, args):
    """Report LINSTOR resource and volume totals when LINSTOR checks are on.

    Steps: probe for the ``linstor`` binary with ``which``, run
    ``linstor --machine-readable resource list``, parse the JSON and count
    resources/volumes.  Two payload layouts are recognised: response
    objects carrying a ``resources`` list, and (only when the first layout
    yields nothing) a direct list of objects that have a ``name`` key.

    Args:
        args: Parsed command-line namespace; reads ``check_linstor``,
            ``verbose`` and ``performance_data``.

    Returns:
        None.  Appends to ``self.ok_messages``, ``self.warnings`` and
        ``self.perfdata``; informational and failure messages are only
        emitted in verbose mode.
    """
    if not args.check_linstor:
        return

    # Is the linstor CLI installed at all?
    probe_rc, _, _ = self.execute_command(['which', 'linstor'])
    if probe_rc != 0:
        if args.verbose:
            self.ok_messages.append("LINSTOR: Command not available (optional)")
        return

    list_rc, stdout, stderr = self.execute_command(
        ['linstor', '--machine-readable', 'resource', 'list'])
    if list_rc != 0:
        if args.verbose:
            self.warnings.append(f"LINSTOR: Failed to get resource list: {stderr}")
        return

    def volume_total(entry):
        # Volumes live under 'vlms' or, failing that, under 'volumes'.
        vols = entry.get('vlms', []) or entry.get('volumes', [])
        return len(vols) if isinstance(vols, list) else 0

    try:
        payload = json.loads(stdout)

        if not (isinstance(payload, list) and len(payload) != 0):
            if args.verbose:
                self.ok_messages.append("LINSTOR: No data returned")
            return

        dict_items = [item for item in payload if isinstance(item, dict)]
        resource_total = 0
        volume_count = 0

        # Layout 1: response objects with a 'resources' list
        for item in dict_items:
            nested = item.get('resources', [])
            if not (nested and isinstance(nested, list)):
                continue
            for res in nested:
                if isinstance(res, dict):
                    resource_total += 1
                    volume_count += volume_total(res)

        # Layout 2: the top-level list holds the resources directly
        if resource_total == 0:
            for item in dict_items:
                if 'name' in item:
                    resource_total += 1
                    volume_count += volume_total(item)

        if resource_total <= 0:
            if args.verbose:
                self.ok_messages.append("LINSTOR: No resources found")
            return

        self.ok_messages.append(
            f"LINSTOR: {resource_total} resources, {volume_count} volumes")

        if args.performance_data:
            self.perfdata.append(f"linstor_resources={resource_total}")
            self.perfdata.append(f"linstor_volumes={volume_count}")

    except json.JSONDecodeError as err:
        if args.verbose:
            self.warnings.append(f"LINSTOR: JSON parse error: {str(err)[:50]}")
    except Exception as err:
        if args.verbose:
            self.warnings.append(f"LINSTOR: Processing error: {str(err)[:50]}")
def run_checks(self, args): def run_checks(self, args):
"""Main check execution""" """Main check execution"""
@@ -363,7 +544,7 @@ class DRBDMonitor:
# Parse output # Parse output
data = self.parse_events2_output(stdout) data = self.parse_events2_output(stdout)
# Run all checks # Run DRBD checks
if data['resources']: if data['resources']:
self.check_resource_status(data['resources'], args) self.check_resource_status(data['resources'], args)
@@ -376,12 +557,15 @@ class DRBDMonitor:
if data['peer_devices']: if data['peer_devices']:
self.check_peer_device_status(data['peer_devices'], args) self.check_peer_device_status(data['peer_devices'], args)
# Add performance data # Add DRBD performance data
if args.performance_data: self.add_performance_data(data['devices'], data['peer_devices'], args)
self.add_performance_data(data['devices'], data['peer_devices'])
# Check LINSTOR if requested # Run LINSTOR checks if enabled
self.check_linstor_status(args) if args.check_linstor:
self.check_linstor_nodes(args)
self.check_linstor_storage_pools(args)
self.check_linstor_resources(args)
self.check_linstor_error_reports(args)
# Determine final status # Determine final status
return self.get_final_status(args) return self.get_final_status(args)
@@ -404,7 +588,7 @@ class DRBDMonitor:
else: else:
status = STATE_OK status = STATE_OK
status_text = "OK" status_text = "OK"
messages = self.ok_messages if args.verbose else ["All DRBD checks passed"] messages = self.ok_messages if args.verbose else ["All DRBD/LINSTOR checks passed"]
# Build output # Build output
output = f"{status_text} - {'; '.join(messages)}" output = f"{status_text} - {'; '.join(messages)}"
@@ -422,8 +606,8 @@ def main():
formatter_class=argparse.RawDescriptionHelpFormatter, formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=""" epilog="""
Examples: Examples:
# Check all parameters for all resources # Check all DRBD and LINSTOR parameters
%(prog)s --all %(prog)s --all --check-linstor
# Check specific resource # Check specific resource
%(prog)s --resource r0 --all %(prog)s --resource r0 --all
@@ -431,14 +615,11 @@ Examples:
# Check only connection and replication status # Check only connection and replication status
%(prog)s --check-connection --check-replication %(prog)s --check-connection --check-replication
# Check with performance data and verbose output # Full check with performance data and verbose output
%(prog)s --all --performance-data --verbose %(prog)s --all --check-linstor --check-linstor-errors --performance-data --verbose
# Check LINSTOR status as well # Ignore transient issues
%(prog)s --all --check-linstor %(prog)s --all --check-linstor --ignore-transient-congestion
# Ignore transient congestion warnings
%(prog)s --all --ignore-transient-congestion
""" """
) )
@@ -446,9 +627,9 @@ Examples:
parser.add_argument('-r', '--resource', parser.add_argument('-r', '--resource',
help='DRBD resource name to check (default: all)') help='DRBD resource name to check (default: all)')
# Check options # Check options - DRBD
parser.add_argument('--all', action='store_true', parser.add_argument('--all', action='store_true',
help='Enable all checks (recommended)') help='Enable all DRBD checks (recommended)')
parser.add_argument('--check-role', action='store_true', parser.add_argument('--check-role', action='store_true',
help='Check resource role (Primary/Secondary)') help='Check resource role (Primary/Secondary)')
@@ -469,10 +650,10 @@ Examples:
help='Check if resource is suspended') help='Check if resource is suspended')
parser.add_argument('--check-promotion', action='store_true', parser.add_argument('--check-promotion', action='store_true',
help='Check if resource may be promoted (disabled by default for diskless clients)') help='Check if resource may be promoted')
parser.add_argument('--require-promotion-capability', action='store_true', parser.add_argument('--require-promotion-capability', action='store_true',
help='Warn if Secondary resources cannot be promoted (usually OK for diskless clients)') help='Warn if Secondary resources cannot be promoted')
parser.add_argument('--check-quorum', action='store_true', parser.add_argument('--check-quorum', action='store_true',
help='Check quorum status') help='Check quorum status')
@@ -481,7 +662,7 @@ Examples:
help='Check network congestion') help='Check network congestion')
parser.add_argument('--ignore-transient-congestion', action='store_true', parser.add_argument('--ignore-transient-congestion', action='store_true',
help='Ignore transient congestion warnings (recommended for busy networks)') help='Ignore transient congestion warnings')
parser.add_argument('--check-client', action='store_true', parser.add_argument('--check-client', action='store_true',
help='Check if running in client mode') help='Check if running in client mode')
@@ -489,8 +670,18 @@ Examples:
parser.add_argument('--check-resync-suspended', action='store_true', parser.add_argument('--check-resync-suspended', action='store_true',
help='Check if resync is suspended') help='Check if resync is suspended')
# LINSTOR options
parser.add_argument('--check-linstor', action='store_true', parser.add_argument('--check-linstor', action='store_true',
help='Check LINSTOR status (requires linstor command)') help='Check LINSTOR status (nodes, storage pools, resources)')
parser.add_argument('--check-linstor-errors', action='store_true',
help='Check LINSTOR error reports')
parser.add_argument('--linstor-error-warning', type=int, default=10,
help='Warning threshold for error reports (default: 10)')
parser.add_argument('--linstor-error-critical', type=int, default=50,
help='Critical threshold for error reports (default: 50)')
# Behavior options # Behavior options
parser.add_argument('--require-primary', action='store_true', parser.add_argument('--require-primary', action='store_true',
@@ -505,11 +696,11 @@ Examples:
parser.add_argument('-v', '--verbose', action='store_true', parser.add_argument('-v', '--verbose', action='store_true',
help='Verbose output (show all status messages)') help='Verbose output (show all status messages)')
parser.add_argument('--version', action='version', version='%(prog)s 1.2') parser.add_argument('--version', action='version', version='%(prog)s 2.0')
args = parser.parse_args() args = parser.parse_args()
# If --all is specified, enable all checks EXCEPT require-promotion-capability # If --all is specified, enable all DRBD checks
if args.all: if args.all:
args.check_role = True args.check_role = True
args.check_disk = True args.check_disk = True
@@ -517,8 +708,6 @@ Examples:
args.check_replication = True args.check_replication = True
args.check_peer_disk = True args.check_peer_disk = True
args.check_suspended = True args.check_suspended = True
# Don't enable check_promotion by default - diskless clients can't promote
# args.check_promotion = True
args.check_quorum = True args.check_quorum = True
args.check_congestion = True args.check_congestion = True
args.check_client = True args.check_client = True
@@ -529,7 +718,7 @@ Examples:
if not any([args.check_role, args.check_disk, args.check_connection, if not any([args.check_role, args.check_disk, args.check_connection,
args.check_replication, args.check_peer_disk, args.check_suspended, args.check_replication, args.check_peer_disk, args.check_suspended,
args.check_promotion, args.check_quorum, args.check_congestion, args.check_promotion, args.check_quorum, args.check_congestion,
args.check_client, args.check_resync_suspended]): args.check_client, args.check_resync_suspended, args.check_linstor]):
args.check_role = True args.check_role = True
args.check_disk = True args.check_disk = True
args.check_connection = True args.check_connection = True