Update check_drdb_linstor.py
This commit is contained in:
@@ -1,8 +1,9 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""
|
"""
|
||||||
Nagios plugin for comprehensive DRBD/LINSTOR monitoring
|
Nagios plugin for comprehensive DRBD/LINSTOR monitoring
|
||||||
Author: @linuxiarz.pl Mateusz Gruszczyński
|
Author: Custom Nagios DRBD Check
|
||||||
License: GPL v3
|
License: GPL v3
|
||||||
|
Version: 2.0 - Enhanced with full Linstor monitoring
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
@@ -25,7 +26,7 @@ class DRBDMonitor:
|
|||||||
self.warnings = []
|
self.warnings = []
|
||||||
self.criticals = []
|
self.criticals = []
|
||||||
self.ok_messages = []
|
self.ok_messages = []
|
||||||
|
|
||||||
def execute_command(self, cmd: List[str]) -> Tuple[int, str, str]:
|
def execute_command(self, cmd: List[str]) -> Tuple[int, str, str]:
|
||||||
"""Execute system command and return returncode, stdout, stderr"""
|
"""Execute system command and return returncode, stdout, stderr"""
|
||||||
try:
|
try:
|
||||||
@@ -41,7 +42,21 @@ class DRBDMonitor:
|
|||||||
return 124, "", "Command timeout"
|
return 124, "", "Command timeout"
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return 1, "", str(e)
|
return 1, "", str(e)
|
||||||
|
|
||||||
|
def parse_linstor_json(self, stdout: str) -> List:
    """Parse machine-readable LINSTOR output into a flat list of entries.

    The payload may be wrapped in an extra list (``[[entry, ...]]``). When
    the first element of the decoded list is itself a list, that inner list
    is returned; otherwise the decoded list is returned unchanged.

    Args:
        stdout: Raw JSON text captured from a ``linstor -m ...`` command.

    Returns:
        The list of entries, or ``[]`` when the text cannot be decoded, is
        not a JSON list, or is an empty (possibly nested) list.
    """
    try:
        data = json.loads(stdout)
    except (TypeError, ValueError, RecursionError):
        # json.JSONDecodeError subclasses ValueError; TypeError covers a
        # non-string input such as None; RecursionError covers absurdly deep
        # nesting. Anything else is a genuine bug and must stay visible.
        return []

    if not isinstance(data, list) or not data:
        return []

    # Unwrap the outer list even when the inner one is empty, so "[[]]"
    # means "no entries" rather than one bogus list-typed entry.
    if isinstance(data[0], list):
        return data[0]
    return data
|
||||||
|
|
||||||
def parse_events2_output(self, output: str) -> Dict:
|
def parse_events2_output(self, output: str) -> Dict:
|
||||||
"""Parse drbdsetup events2 --now --statistics output"""
|
"""Parse drbdsetup events2 --now --statistics output"""
|
||||||
data = {
|
data = {
|
||||||
@@ -50,25 +65,25 @@ class DRBDMonitor:
|
|||||||
'devices': {},
|
'devices': {},
|
||||||
'peer_devices': {}
|
'peer_devices': {}
|
||||||
}
|
}
|
||||||
|
|
||||||
for line in output.strip().split('\n'):
|
for line in output.strip().split('\n'):
|
||||||
if not line or line.startswith('#'):
|
if not line or line.startswith('#'):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
parts = line.split()
|
parts = line.split()
|
||||||
if len(parts) < 2:
|
if len(parts) < 2:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
event_type = parts[0] # exists, create, change, destroy
|
event_type = parts[0] # exists, create, change, destroy
|
||||||
object_type = parts[1] # resource, connection, device, peer-device
|
object_type = parts[1] # resource, connection, device, peer-device
|
||||||
|
|
||||||
# Parse key:value pairs
|
# Parse key:value pairs
|
||||||
props = {}
|
props = {}
|
||||||
for part in parts[2:]:
|
for part in parts[2:]:
|
||||||
if ':' in part:
|
if ':' in part:
|
||||||
key, value = part.split(':', 1)
|
key, value = part.split(':', 1)
|
||||||
props[key] = value
|
props[key] = value
|
||||||
|
|
||||||
# Store data by object type
|
# Store data by object type
|
||||||
if object_type == 'resource':
|
if object_type == 'resource':
|
||||||
res_name = props.get('name', 'unknown')
|
res_name = props.get('name', 'unknown')
|
||||||
@@ -86,16 +101,16 @@ class DRBDMonitor:
|
|||||||
volume = props.get('volume', '0')
|
volume = props.get('volume', '0')
|
||||||
key = f"{peer_name}:{volume}"
|
key = f"{peer_name}:{volume}"
|
||||||
data['peer_devices'][key] = props
|
data['peer_devices'][key] = props
|
||||||
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def check_resource_status(self, resource_data: Dict, args):
|
def check_resource_status(self, resource_data: Dict, args):
|
||||||
"""Check resource role and status"""
|
"""Check resource role and status"""
|
||||||
for res_name, props in resource_data.items():
|
for res_name, props in resource_data.items():
|
||||||
role = props.get('role', 'Unknown')
|
role = props.get('role', 'Unknown')
|
||||||
suspended = props.get('suspended', 'no')
|
suspended = props.get('suspended', 'no')
|
||||||
may_promote = props.get('may_promote', 'no')
|
may_promote = props.get('may_promote', 'no')
|
||||||
|
|
||||||
# Check role
|
# Check role
|
||||||
if args.check_role:
|
if args.check_role:
|
||||||
if role not in ['Primary', 'Secondary']:
|
if role not in ['Primary', 'Secondary']:
|
||||||
@@ -106,18 +121,16 @@ class DRBDMonitor:
|
|||||||
self.warnings.append(f"Resource {res_name}: Role is Secondary, expected Primary")
|
self.warnings.append(f"Resource {res_name}: Role is Secondary, expected Primary")
|
||||||
else:
|
else:
|
||||||
self.ok_messages.append(f"Resource {res_name}: Role={role}")
|
self.ok_messages.append(f"Resource {res_name}: Role={role}")
|
||||||
|
|
||||||
# Check suspended state
|
# Check suspended state
|
||||||
if args.check_suspended and suspended == 'yes':
|
if args.check_suspended and suspended == 'yes':
|
||||||
self.criticals.append(f"Resource {res_name}: SUSPENDED")
|
self.criticals.append(f"Resource {res_name}: SUSPENDED")
|
||||||
|
|
||||||
# Check promotion capability - ONLY if resource is Secondary and explicitly required
|
# Check promotion capability - ONLY if resource is Secondary and explicitly required
|
||||||
# For diskless clients this is normal behavior
|
|
||||||
if args.check_promotion and may_promote == 'no' and role == 'Secondary':
|
if args.check_promotion and may_promote == 'no' and role == 'Secondary':
|
||||||
# Don't warn for diskless resources (clients), only if explicitly checking
|
|
||||||
if args.require_promotion_capability:
|
if args.require_promotion_capability:
|
||||||
self.warnings.append(f"Resource {res_name}: Cannot be promoted")
|
self.warnings.append(f"Resource {res_name}: Cannot be promoted")
|
||||||
|
|
||||||
def check_connection_status(self, connection_data: Dict, args):
|
def check_connection_status(self, connection_data: Dict, args):
|
||||||
"""Check connection state between peers"""
|
"""Check connection state between peers"""
|
||||||
for conn_name, props in connection_data.items():
|
for conn_name, props in connection_data.items():
|
||||||
@@ -125,11 +138,11 @@ class DRBDMonitor:
|
|||||||
role = props.get('role', 'Unknown')
|
role = props.get('role', 'Unknown')
|
||||||
congested = props.get('congested', 'no')
|
congested = props.get('congested', 'no')
|
||||||
peer_node_id = props.get('peer-node-id', 'unknown')
|
peer_node_id = props.get('peer-node-id', 'unknown')
|
||||||
|
|
||||||
# Check connection state
|
# Check connection state
|
||||||
if args.check_connection:
|
if args.check_connection:
|
||||||
if connection not in ['Connected', 'StandAlone']:
|
if connection not in ['Connected', 'StandAlone']:
|
||||||
if connection in ['Connecting', 'Timeout', 'BrokenPipe', 'NetworkFailure',
|
if connection in ['Connecting', 'Timeout', 'BrokenPipe', 'NetworkFailure',
|
||||||
'ProtocolError', 'TearDown', 'Unconnected', 'Disconnecting']:
|
'ProtocolError', 'TearDown', 'Unconnected', 'Disconnecting']:
|
||||||
self.criticals.append(f"Connection {conn_name}: State={connection}")
|
self.criticals.append(f"Connection {conn_name}: State={connection}")
|
||||||
elif connection in ['WFConnection', 'WFReportParams']:
|
elif connection in ['WFConnection', 'WFReportParams']:
|
||||||
@@ -138,13 +151,12 @@ class DRBDMonitor:
|
|||||||
self.criticals.append(f"Connection {conn_name}: Unknown state {connection}")
|
self.criticals.append(f"Connection {conn_name}: Unknown state {connection}")
|
||||||
else:
|
else:
|
||||||
self.ok_messages.append(f"Connection {conn_name} to peer-{peer_node_id}: {connection}")
|
self.ok_messages.append(f"Connection {conn_name} to peer-{peer_node_id}: {connection}")
|
||||||
|
|
||||||
# Check congestion - only warn if it's persistent or critical
|
# Check congestion
|
||||||
if args.check_congestion and congested == 'yes':
|
if args.check_congestion and congested == 'yes':
|
||||||
# Congestion can be temporary, so only warn instead of critical
|
|
||||||
if not args.ignore_transient_congestion:
|
if not args.ignore_transient_congestion:
|
||||||
self.warnings.append(f"Connection {conn_name}: CONGESTED")
|
self.warnings.append(f"Connection {conn_name}: CONGESTED")
|
||||||
|
|
||||||
def check_device_status(self, device_data: Dict, args):
|
def check_device_status(self, device_data: Dict, args):
|
||||||
"""Check device/volume disk state"""
|
"""Check device/volume disk state"""
|
||||||
for dev_key, props in device_data.items():
|
for dev_key, props in device_data.items():
|
||||||
@@ -154,7 +166,7 @@ class DRBDMonitor:
|
|||||||
minor = props.get('minor', 'unknown')
|
minor = props.get('minor', 'unknown')
|
||||||
client = props.get('client', 'no')
|
client = props.get('client', 'no')
|
||||||
quorum = props.get('quorum', 'yes')
|
quorum = props.get('quorum', 'yes')
|
||||||
|
|
||||||
# Check disk state
|
# Check disk state
|
||||||
if args.check_disk:
|
if args.check_disk:
|
||||||
if disk not in ['UpToDate', 'Diskless']:
|
if disk not in ['UpToDate', 'Diskless']:
|
||||||
@@ -168,15 +180,15 @@ class DRBDMonitor:
|
|||||||
self.warnings.append(f"Device {dev_name} vol:{volume}: Disk={disk}")
|
self.warnings.append(f"Device {dev_name} vol:{volume}: Disk={disk}")
|
||||||
else:
|
else:
|
||||||
self.ok_messages.append(f"Device {dev_name} vol:{volume} (minor:{minor}): Disk={disk}")
|
self.ok_messages.append(f"Device {dev_name} vol:{volume} (minor:{minor}): Disk={disk}")
|
||||||
|
|
||||||
# Check quorum - but not for diskless clients
|
# Check quorum - but not for diskless clients
|
||||||
if args.check_quorum and quorum == 'no' and disk != 'Diskless':
|
if args.check_quorum and quorum == 'no' and disk != 'Diskless':
|
||||||
self.criticals.append(f"Device {dev_name} vol:{volume}: NO QUORUM")
|
self.criticals.append(f"Device {dev_name} vol:{volume}: NO QUORUM")
|
||||||
|
|
||||||
# Check client mode
|
# Check client mode
|
||||||
if args.check_client and client == 'yes':
|
if args.check_client and client == 'yes':
|
||||||
self.ok_messages.append(f"Device {dev_name} vol:{volume}: Running in client mode")
|
self.ok_messages.append(f"Device {dev_name} vol:{volume}: Running in client mode")
|
||||||
|
|
||||||
def check_peer_device_status(self, peer_device_data: Dict, args):
|
def check_peer_device_status(self, peer_device_data: Dict, args):
|
||||||
"""Check peer device replication state"""
|
"""Check peer device replication state"""
|
||||||
for peer_key, props in peer_device_data.items():
|
for peer_key, props in peer_device_data.items():
|
||||||
@@ -187,12 +199,11 @@ class DRBDMonitor:
|
|||||||
resync_suspended = props.get('resync-suspended', 'no')
|
resync_suspended = props.get('resync-suspended', 'no')
|
||||||
peer_client = props.get('peer-client', 'no')
|
peer_client = props.get('peer-client', 'no')
|
||||||
peer_node_id = props.get('peer-node-id', 'unknown')
|
peer_node_id = props.get('peer-node-id', 'unknown')
|
||||||
|
|
||||||
# Check replication state
|
# Check replication state
|
||||||
if args.check_replication:
|
if args.check_replication:
|
||||||
if replication not in ['Established', 'Off']:
|
if replication not in ['Established', 'Off']:
|
||||||
if replication in ['SyncSource', 'SyncTarget']:
|
if replication in ['SyncSource', 'SyncTarget']:
|
||||||
# Synchronization in progress - warning or OK depending on config
|
|
||||||
if args.warn_on_sync:
|
if args.warn_on_sync:
|
||||||
self.warnings.append(f"Peer {peer_name} vol:{volume}: Syncing ({replication})")
|
self.warnings.append(f"Peer {peer_name} vol:{volume}: Syncing ({replication})")
|
||||||
else:
|
else:
|
||||||
@@ -209,7 +220,7 @@ class DRBDMonitor:
|
|||||||
self.criticals.append(f"Peer {peer_name} vol:{volume}: Replication={replication}")
|
self.criticals.append(f"Peer {peer_name} vol:{volume}: Replication={replication}")
|
||||||
else:
|
else:
|
||||||
self.ok_messages.append(f"Peer {peer_name} (node-{peer_node_id}) vol:{volume}: {replication}")
|
self.ok_messages.append(f"Peer {peer_name} (node-{peer_node_id}) vol:{volume}: {replication}")
|
||||||
|
|
||||||
# Check peer disk state
|
# Check peer disk state
|
||||||
if args.check_peer_disk:
|
if args.check_peer_disk:
|
||||||
if peer_disk not in ['UpToDate', 'Diskless', 'DUnknown']:
|
if peer_disk not in ['UpToDate', 'Diskless', 'DUnknown']:
|
||||||
@@ -217,175 +228,348 @@ class DRBDMonitor:
|
|||||||
self.criticals.append(f"Peer {peer_name} vol:{volume}: Peer-Disk={peer_disk}")
|
self.criticals.append(f"Peer {peer_name} vol:{volume}: Peer-Disk={peer_disk}")
|
||||||
else:
|
else:
|
||||||
self.warnings.append(f"Peer {peer_name} vol:{volume}: Peer-Disk={peer_disk}")
|
self.warnings.append(f"Peer {peer_name} vol:{volume}: Peer-Disk={peer_disk}")
|
||||||
|
|
||||||
# Check resync suspended
|
# Check resync suspended
|
||||||
if args.check_resync_suspended and resync_suspended == 'yes':
|
if args.check_resync_suspended and resync_suspended == 'yes':
|
||||||
self.warnings.append(f"Peer {peer_name} vol:{volume}: Resync SUSPENDED")
|
self.warnings.append(f"Peer {peer_name} vol:{volume}: Resync SUSPENDED")
|
||||||
|
|
||||||
def get_statistics(self, resource: str = 'all') -> Dict:
|
def check_linstor_nodes(self, args):
|
||||||
"""Get DRBD statistics from drbdsetup events2 --statistics"""
|
"""Check Linstor node status and communication"""
|
||||||
cmd = ['drbdsetup', 'events2', '--now', '--statistics', resource]
|
|
||||||
rc, stdout, stderr = self.execute_command(cmd)
|
|
||||||
|
|
||||||
if rc != 0:
|
|
||||||
return {}
|
|
||||||
|
|
||||||
stats = {}
|
|
||||||
for line in stdout.strip().split('\n'):
|
|
||||||
parts = line.split()
|
|
||||||
if len(parts) < 2:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Look for statistics in the output
|
|
||||||
for part in parts[2:]:
|
|
||||||
if ':' in part:
|
|
||||||
key, value = part.split(':', 1)
|
|
||||||
try:
|
|
||||||
stats[key] = int(value)
|
|
||||||
except ValueError:
|
|
||||||
stats[key] = value
|
|
||||||
|
|
||||||
return stats
|
|
||||||
|
|
||||||
def add_performance_data(self, device_data: Dict, peer_device_data: Dict):
|
|
||||||
"""Add performance data for Nagios"""
|
|
||||||
# Count resources, devices, connections
|
|
||||||
resource_count = len(device_data)
|
|
||||||
self.perfdata.append(f"resources={resource_count}")
|
|
||||||
|
|
||||||
# Count by disk state
|
|
||||||
uptodate = sum(1 for d in device_data.values() if d.get('disk') == 'UpToDate')
|
|
||||||
self.perfdata.append(f"uptodate_devices={uptodate}")
|
|
||||||
|
|
||||||
# Count by replication state
|
|
||||||
established = sum(1 for p in peer_device_data.values()
|
|
||||||
if p.get('replication') == 'Established')
|
|
||||||
syncing = sum(1 for p in peer_device_data.values()
|
|
||||||
if p.get('replication') in ['SyncSource', 'SyncTarget'])
|
|
||||||
self.perfdata.append(f"established_replications={established}")
|
|
||||||
self.perfdata.append(f"syncing_replications={syncing}")
|
|
||||||
|
|
||||||
def check_linstor_status(self, args):
|
|
||||||
"""Check LINSTOR specific status if available"""
|
|
||||||
if not args.check_linstor:
|
if not args.check_linstor:
|
||||||
return
|
return
|
||||||
|
|
||||||
# Check if linstor command is available
|
cmd = ['linstor', '-m', 'node', 'list']
|
||||||
cmd = ['which', 'linstor']
|
|
||||||
rc, _, _ = self.execute_command(cmd)
|
|
||||||
if rc != 0:
|
|
||||||
if args.verbose:
|
|
||||||
self.ok_messages.append("LINSTOR: Command not available (optional)")
|
|
||||||
return
|
|
||||||
|
|
||||||
# Get resource list - try different output formats
|
|
||||||
cmd = ['linstor', '--machine-readable', 'resource', 'list']
|
|
||||||
rc, stdout, stderr = self.execute_command(cmd)
|
rc, stdout, stderr = self.execute_command(cmd)
|
||||||
|
|
||||||
if rc != 0:
|
if rc != 0:
|
||||||
if args.verbose:
|
if args.verbose:
|
||||||
self.warnings.append(f"LINSTOR: Failed to get resource list: {stderr}")
|
self.warnings.append(f"LINSTOR: Cannot get node list")
|
||||||
return
|
return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Parse JSON output
|
nodes = self.parse_linstor_json(stdout)
|
||||||
linstor_output = json.loads(stdout)
|
|
||||||
|
online_nodes = 0
|
||||||
# LINSTOR returns an array, first check if it's valid
|
offline_nodes = []
|
||||||
if not isinstance(linstor_output, list) or len(linstor_output) == 0:
|
evicted_nodes = []
|
||||||
if args.verbose:
|
|
||||||
self.ok_messages.append("LINSTOR: No data returned")
|
for node in nodes:
|
||||||
return
|
if not isinstance(node, dict):
|
||||||
|
continue
|
||||||
# Try to extract resource data from various possible formats
|
|
||||||
linstor_resource_count = 0
|
node_name = node.get('name', 'unknown')
|
||||||
linstor_volume_count = 0
|
connection_status = node.get('connection_status', 'UNKNOWN')
|
||||||
|
node_type = node.get('type', 'UNKNOWN')
|
||||||
# Format 1: Array of response objects with 'resources' key
|
flags = node.get('flags', [])
|
||||||
for item in linstor_output:
|
|
||||||
if isinstance(item, dict):
|
# Check for EVICTED flag
|
||||||
# Try 'resources' key
|
if 'EVICTED' in flags:
|
||||||
resources = item.get('resources', [])
|
evicted_nodes.append(f"{node_name}({node_type})")
|
||||||
if resources and isinstance(resources, list):
|
continue
|
||||||
for res in resources:
|
|
||||||
if isinstance(res, dict):
|
if connection_status == 'ONLINE':
|
||||||
linstor_resource_count += 1
|
online_nodes += 1
|
||||||
volumes = res.get('vlms', []) or res.get('volumes', [])
|
else:
|
||||||
if isinstance(volumes, list):
|
offline_nodes.append(f"{node_name}({node_type}):{connection_status}")
|
||||||
linstor_volume_count += len(volumes)
|
|
||||||
|
if offline_nodes:
|
||||||
# Format 2: Direct array of resources (older format)
|
self.criticals.append(f"LINSTOR: Nodes offline: {', '.join(offline_nodes)}")
|
||||||
if linstor_resource_count == 0:
|
|
||||||
for item in linstor_output:
|
if evicted_nodes:
|
||||||
if isinstance(item, dict) and 'name' in item:
|
self.warnings.append(f"LINSTOR: Nodes evicted: {', '.join(evicted_nodes)}")
|
||||||
linstor_resource_count += 1
|
|
||||||
volumes = item.get('vlms', []) or item.get('volumes', [])
|
if not offline_nodes and not evicted_nodes and online_nodes > 0:
|
||||||
if isinstance(volumes, list):
|
self.ok_messages.append(f"LINSTOR: All {online_nodes} nodes online")
|
||||||
linstor_volume_count += len(volumes)
|
|
||||||
|
if args.performance_data:
|
||||||
if linstor_resource_count > 0:
|
self.perfdata.append(f"linstor_nodes_online={online_nodes}")
|
||||||
self.ok_messages.append(
|
self.perfdata.append(f"linstor_nodes_offline={len(offline_nodes)}")
|
||||||
f"LINSTOR: {linstor_resource_count} resources, "
|
|
||||||
f"{linstor_volume_count} volumes"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Add performance data
|
|
||||||
if args.performance_data:
|
|
||||||
self.perfdata.append(f"linstor_resources={linstor_resource_count}")
|
|
||||||
self.perfdata.append(f"linstor_volumes={linstor_volume_count}")
|
|
||||||
else:
|
|
||||||
if args.verbose:
|
|
||||||
self.ok_messages.append("LINSTOR: No resources found")
|
|
||||||
|
|
||||||
except json.JSONDecodeError as e:
|
|
||||||
if args.verbose:
|
|
||||||
self.warnings.append(f"LINSTOR: JSON parse error: {str(e)[:50]}")
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if args.verbose:
|
if args.verbose:
|
||||||
self.warnings.append(f"LINSTOR: Processing error: {str(e)[:50]}")
|
self.warnings.append(f"LINSTOR: Error checking nodes: {str(e)[:50]}")
|
||||||
|
|
||||||
|
def check_linstor_storage_pools(self, args):
    """Check LINSTOR storage pools for reported problems and high usage.

    Runs ``linstor -m storage-pool list`` and inspects every pool entry:

    * report messages matching known failure patterns become errors; other
      reports with a non-zero ``ret_code`` and a message become warnings;
    * pools whose ``provider_kind`` is ``DISKLESS`` are not capacity-checked;
    * usage >= 95% is an error, usage >= 90% is a warning;
    * a pool whose capacity fields are missing or not numeric is reported
      as "Cannot read capacity".

    Findings are appended to ``self.criticals`` / ``self.warnings`` /
    ``self.ok_messages`` (only the first three per category are printed).
    Aggregated usage is appended to ``self.perfdata`` when
    ``args.performance_data`` is set.

    A failing command or an unexpected processing error is only reported
    (as a warning) when ``args.verbose`` is set.

    Args:
        args: Parsed command-line options; reads ``check_linstor``,
            ``verbose`` and ``performance_data``.
    """
    if not args.check_linstor:
        return

    cmd = ['linstor', '-m', 'storage-pool', 'list']
    rc, stdout, stderr = self.execute_command(cmd)

    if rc != 0:
        if args.verbose:
            self.warnings.append("LINSTOR: Cannot get storage pool list")
        return

    try:
        storage_pools = self.parse_linstor_json(stdout)

        pool_errors = []
        pool_warnings = []
        total_capacity = 0
        total_free = 0
        pools_checked = 0

        for pool in storage_pools:
            if not isinstance(pool, dict):
                continue

            node_name = pool.get('node_name', 'unknown')
            pool_name = pool.get('stor_pool_name', 'unknown')
            provider_kind = pool.get('provider_kind', 'unknown')
            label = f"{node_name}:{pool_name}"

            # "reports": null must behave like "no reports"; iterating None
            # would abort the whole check and drop earlier findings.
            for report in pool.get('reports') or []:
                if not isinstance(report, dict):
                    continue

                message = report.get('message') or ''
                if not isinstance(message, str):
                    message = str(message)
                ret_code = report.get('ret_code', 0)

                # Check for specific error patterns
                if 'ApiRcException' in message or 'RuntimeException' in message:
                    pool_errors.append(f"{label} - ApiRcException")
                elif 'Failed to query free space' in message:
                    pool_errors.append(f"{label} - Cannot query free space")
                elif 'query free space from storage pool' in message.lower():
                    pool_errors.append(f"{label} - Storage pool query failed")
                elif ret_code not in [0, None] and message:
                    pool_warnings.append(f"{label} - {message[:60]}")

            # Diskless pools have no backing storage to measure.
            if provider_kind == 'DISKLESS':
                continue

            free_capacity = pool.get('free_capacity')
            total_capacity_pool = pool.get('total_capacity')

            # Missing or non-numeric capacities are a finding for this pool
            # only; they must not raise and hide the other pools' results.
            readable = all(
                isinstance(value, (int, float)) and not isinstance(value, bool)
                for value in (free_capacity, total_capacity_pool)
            )
            if not readable:
                pool_warnings.append(f"{label} - Cannot read capacity")
                continue

            pools_checked += 1
            total_capacity += total_capacity_pool
            total_free += free_capacity

            # A zero-sized pool has no meaningful usage percentage.
            if total_capacity_pool > 0:
                used = total_capacity_pool - free_capacity
                usage_percent = (used / total_capacity_pool) * 100

                if usage_percent >= 95:
                    pool_errors.append(f"{label} - {usage_percent:.1f}% full")
                elif usage_percent >= 90:
                    pool_warnings.append(f"{label} - {usage_percent:.1f}% full")

        # Report findings; the plugin output line is kept short by printing
        # only the first three entries of each category.
        if pool_errors:
            self.criticals.append(
                f"LINSTOR: Storage pool errors: {'; '.join(pool_errors[:3])}")

        if pool_warnings:
            self.warnings.append(
                f"LINSTOR: Storage pool warnings: {'; '.join(pool_warnings[:3])}")

        if not pool_errors and not pool_warnings and pools_checked > 0:
            self.ok_messages.append(f"LINSTOR: {pools_checked} storage pools OK")

        # Performance data. NOTE(review): the "_mb" labels combined with
        # "// 1024" presume LINSTOR reports capacities in KiB - confirm.
        if total_capacity > 0 and args.performance_data:
            usage_percent = ((total_capacity - total_free) / total_capacity) * 100
            self.perfdata.append(f"linstor_storage_usage={usage_percent:.2f}%")
            self.perfdata.append(f"linstor_storage_free_mb={total_free // 1024}")
            self.perfdata.append(f"linstor_storage_total_mb={total_capacity // 1024}")

    except Exception as e:
        # Best-effort check: an unexpected failure never crashes the plugin.
        if args.verbose:
            self.warnings.append(f"LINSTOR: Error checking storage pools: {str(e)[:50]}")
|
||||||
|
|
||||||
|
def check_linstor_resources(self, args):
    """Check LINSTOR resources for volumes that are syncing or in error.

    Runs ``linstor -m resource list`` and walks every volume (``vlms``) of
    every resource entry, looking at the ``disk_state`` and ``repl_state``
    of each layer whose ``type`` is ``DRBD``:

    * a sync-related replication state adds the volume to the syncing list
      (reported as a warning);
    * a failed/inconsistent disk state or a broken replication state adds
      the volume to the error list (reported as critical). Each DRBD layer
      contributes at most one error entry, so the error counter reflects
      volumes rather than individual symptoms.

    Counters are appended to ``self.perfdata`` when
    ``args.performance_data`` is set. A failing command or an unexpected
    processing error is only reported (as a warning) when ``args.verbose``
    is set.

    Args:
        args: Parsed command-line options; reads ``check_linstor``,
            ``verbose`` and ``performance_data``.
    """
    if not args.check_linstor:
        return

    cmd = ['linstor', '-m', 'resource', 'list']
    rc, stdout, stderr = self.execute_command(cmd)

    if rc != 0:
        if args.verbose:
            self.warnings.append("LINSTOR: Cannot get resource list")
        return

    sync_states = ('SyncSource', 'SyncTarget', 'PausedSyncS', 'PausedSyncT')
    bad_disk_states = ('Failed', 'Inconsistent')
    bad_repl_states = ('StandAlone', 'Disconnecting', 'NetworkFailure')

    try:
        resources = self.parse_linstor_json(stdout)

        syncing_resources = []
        error_resources = []
        total_volumes = 0
        uptodate_count = 0

        for resource in resources:
            if not isinstance(resource, dict):
                continue

            node_name = resource.get('node_name', 'unknown')
            resource_name = resource.get('name', 'unknown')

            # "vlms": null must behave like "no volumes"; iterating None
            # would abort the whole check and drop earlier findings.
            for volume in resource.get('vlms') or []:
                if not isinstance(volume, dict):
                    continue

                total_volumes += 1
                vol_nr = volume.get('vlm_nr', 0)
                label = f"{resource_name}@{node_name}:{vol_nr}"

                for layer in volume.get('layer_data_list') or []:
                    if not isinstance(layer, dict) or layer.get('type') != 'DRBD':
                        continue

                    drbd_data = layer.get('data') or {}
                    if not isinstance(drbd_data, dict):
                        continue

                    disk_state = drbd_data.get('disk_state', '')
                    replication_state = drbd_data.get('repl_state', '')

                    if disk_state == 'UpToDate':
                        uptodate_count += 1

                    if replication_state in sync_states:
                        syncing_resources.append(f"{label} ({replication_state})")

                    # Collect every bad state of this layer into ONE entry so
                    # len(error_resources) is not inflated by a volume that
                    # is broken in two ways at once.
                    bad_states = []
                    if disk_state in bad_disk_states:
                        bad_states.append(disk_state)
                    if replication_state in bad_repl_states:
                        bad_states.append(replication_state)
                    if bad_states:
                        error_resources.append(f"{label} ({', '.join(bad_states)})")

        # Report sync status; long lists are abbreviated to three entries.
        if syncing_resources:
            sync_count = len(syncing_resources)
            if sync_count <= 3:
                self.warnings.append(
                    f"LINSTOR: Resources syncing: {', '.join(syncing_resources)}")
            else:
                self.warnings.append(
                    f"LINSTOR: {sync_count} resources syncing: {', '.join(syncing_resources[:3])}...")

        if error_resources:
            self.criticals.append(
                f"LINSTOR: Resources in error state: {', '.join(error_resources)}")

        if not syncing_resources and not error_resources and total_volumes > 0:
            self.ok_messages.append(
                f"LINSTOR: {uptodate_count}/{total_volumes} volumes UpToDate")

        # Performance data
        if args.performance_data:
            self.perfdata.append(f"linstor_volumes_total={total_volumes}")
            self.perfdata.append(f"linstor_volumes_syncing={len(syncing_resources)}")
            self.perfdata.append(f"linstor_volumes_errors={len(error_resources)}")
            self.perfdata.append(f"linstor_volumes_uptodate={uptodate_count}")

    except Exception as e:
        # Best-effort check: an unexpected failure never crashes the plugin.
        if args.verbose:
            self.warnings.append(f"LINSTOR: Error checking resources: {str(e)[:50]}")
|
||||||
|
|
||||||
|
def check_linstor_error_reports(self, args):
    """Count LINSTOR error reports and compare the count to thresholds.

    Runs ``linstor -m error-reports list`` and counts the dict entries in
    the parsed output. A count above ``args.linstor_error_critical`` is
    critical and a count above ``args.linstor_error_warning`` is a warning;
    any other non-zero count is mentioned only in verbose mode. The counter
    is appended to ``self.perfdata`` (including a value of 0, so graphs
    have no gaps) when ``args.performance_data`` is set.

    A failing command or an unexpected processing error is reported as a
    warning when ``args.verbose`` is set, matching the other LINSTOR checks.

    Args:
        args: Parsed command-line options; reads ``check_linstor``,
            ``check_linstor_errors``, ``linstor_error_critical``,
            ``linstor_error_warning``, ``verbose`` and ``performance_data``.
    """
    if not args.check_linstor or not args.check_linstor_errors:
        return

    cmd = ['linstor', '-m', 'error-reports', 'list']
    rc, stdout, stderr = self.execute_command(cmd)

    if rc != 0:
        if args.verbose:
            self.warnings.append("LINSTOR: Cannot get error report list")
        return

    try:
        error_reports = self.parse_linstor_json(stdout)

        # Count only real report objects; stray non-dict entries (e.g. an
        # empty inner list) must not register as an error report.
        report_count = sum(1 for report in error_reports if isinstance(report, dict))

        if report_count > 0:
            # Apply threshold
            if report_count > args.linstor_error_critical:
                self.criticals.append(f"LINSTOR: {report_count} error reports in system")
            elif report_count > args.linstor_error_warning:
                self.warnings.append(f"LINSTOR: {report_count} error reports in system")
            elif args.verbose:
                self.ok_messages.append(f"LINSTOR: {report_count} error reports (below threshold)")

        if args.performance_data:
            self.perfdata.append(f"linstor_error_reports={report_count}")

    except Exception as e:
        # Best-effort check: never crash the plugin, but do not hide the
        # failure from an operator who asked for verbose output.
        if args.verbose:
            self.warnings.append(f"LINSTOR: Error checking error reports: {str(e)[:50]}")
|
||||||
|
|
||||||
|
def add_performance_data(self, device_data: Dict, peer_device_data: Dict, args):
    """Append DRBD counters to the Nagios performance data list.

    Does nothing unless ``args.performance_data`` is set. Otherwise four
    ``label=value`` entries are appended to ``self.perfdata``, in order: the
    number of entries in ``device_data``, how many of them have ``disk`` equal
    to ``UpToDate``, and how many ``peer_device_data`` entries have
    ``replication`` equal to ``Established`` or to one of the two sync states.

    Args:
        device_data: Mapping of device key to its property dict.
        peer_device_data: Mapping of peer-device key to its property dict.
        args: Parsed command-line options; reads ``performance_data``.
    """
    if not args.performance_data:
        return

    emit = self.perfdata.append

    # Number of device entries (published under the "resources" label).
    emit(f"drbd_resources={len(device_data)}")

    # Devices whose local disk is fully up to date.
    disk_states = [dev.get('disk') for dev in device_data.values()]
    emit(f"drbd_uptodate_devices={disk_states.count('UpToDate')}")

    # Peer devices grouped by replication state.
    repl_states = [peer.get('replication') for peer in peer_device_data.values()]
    established_total = repl_states.count('Established')
    syncing_total = repl_states.count('SyncSource') + repl_states.count('SyncTarget')
    emit(f"drbd_established_replications={established_total}")
    emit(f"drbd_syncing_replications={syncing_total}")
|
||||||
|
|
||||||
def run_checks(self, args):
|
def run_checks(self, args):
|
||||||
"""Main check execution"""
|
"""Main check execution"""
|
||||||
# Get DRBD events2 output
|
# Get DRBD events2 output
|
||||||
resource_filter = args.resource if args.resource else 'all'
|
resource_filter = args.resource if args.resource else 'all'
|
||||||
cmd = ['drbdsetup', 'events2', '--now', '--statistics', resource_filter]
|
cmd = ['drbdsetup', 'events2', '--now', '--statistics', resource_filter]
|
||||||
|
|
||||||
rc, stdout, stderr = self.execute_command(cmd)
|
rc, stdout, stderr = self.execute_command(cmd)
|
||||||
|
|
||||||
if rc != 0:
|
if rc != 0:
|
||||||
print(f"CRITICAL - Failed to execute drbdsetup: {stderr}")
|
print(f"CRITICAL - Failed to execute drbdsetup: {stderr}")
|
||||||
sys.exit(STATE_CRITICAL)
|
sys.exit(STATE_CRITICAL)
|
||||||
|
|
||||||
if not stdout.strip():
|
if not stdout.strip():
|
||||||
print("CRITICAL - No DRBD resources found")
|
print("CRITICAL - No DRBD resources found")
|
||||||
sys.exit(STATE_CRITICAL)
|
sys.exit(STATE_CRITICAL)
|
||||||
|
|
||||||
# Parse output
|
# Parse output
|
||||||
data = self.parse_events2_output(stdout)
|
data = self.parse_events2_output(stdout)
|
||||||
|
|
||||||
# Run all checks
|
# Run DRBD checks
|
||||||
if data['resources']:
|
if data['resources']:
|
||||||
self.check_resource_status(data['resources'], args)
|
self.check_resource_status(data['resources'], args)
|
||||||
|
|
||||||
if data['connections']:
|
if data['connections']:
|
||||||
self.check_connection_status(data['connections'], args)
|
self.check_connection_status(data['connections'], args)
|
||||||
|
|
||||||
if data['devices']:
|
if data['devices']:
|
||||||
self.check_device_status(data['devices'], args)
|
self.check_device_status(data['devices'], args)
|
||||||
|
|
||||||
if data['peer_devices']:
|
if data['peer_devices']:
|
||||||
self.check_peer_device_status(data['peer_devices'], args)
|
self.check_peer_device_status(data['peer_devices'], args)
|
||||||
|
|
||||||
# Add performance data
|
# Add DRBD performance data
|
||||||
if args.performance_data:
|
self.add_performance_data(data['devices'], data['peer_devices'], args)
|
||||||
self.add_performance_data(data['devices'], data['peer_devices'])
|
|
||||||
|
# Run LINSTOR checks if enabled
|
||||||
# Check LINSTOR if requested
|
if args.check_linstor:
|
||||||
self.check_linstor_status(args)
|
self.check_linstor_nodes(args)
|
||||||
|
self.check_linstor_storage_pools(args)
|
||||||
|
self.check_linstor_resources(args)
|
||||||
|
self.check_linstor_error_reports(args)
|
||||||
|
|
||||||
# Determine final status
|
# Determine final status
|
||||||
return self.get_final_status(args)
|
return self.get_final_status(args)
|
||||||
|
|
||||||
def get_final_status(self, args) -> int:
|
def get_final_status(self, args) -> int:
|
||||||
"""Determine final Nagios status and output"""
|
"""Determine final Nagios status and output"""
|
||||||
if self.criticals:
|
if self.criticals:
|
||||||
@@ -404,14 +588,14 @@ class DRBDMonitor:
|
|||||||
else:
|
else:
|
||||||
status = STATE_OK
|
status = STATE_OK
|
||||||
status_text = "OK"
|
status_text = "OK"
|
||||||
messages = self.ok_messages if args.verbose else ["All DRBD checks passed"]
|
messages = self.ok_messages if args.verbose else ["All DRBD/LINSTOR checks passed"]
|
||||||
|
|
||||||
# Build output
|
# Build output
|
||||||
output = f"{status_text} - {'; '.join(messages)}"
|
output = f"{status_text} - {'; '.join(messages)}"
|
||||||
|
|
||||||
if self.perfdata:
|
if self.perfdata:
|
||||||
output += " | " + " ".join(self.perfdata)
|
output += " | " + " ".join(self.perfdata)
|
||||||
|
|
||||||
print(output)
|
print(output)
|
||||||
return status
|
return status
|
||||||
|
|
||||||
@@ -422,94 +606,101 @@ def main():
|
|||||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||||
epilog="""
|
epilog="""
|
||||||
Examples:
|
Examples:
|
||||||
# Check all parameters for all resources
|
# Check all DRBD and LINSTOR parameters
|
||||||
%(prog)s --all
|
%(prog)s --all --check-linstor
|
||||||
|
|
||||||
# Check specific resource
|
# Check specific resource
|
||||||
%(prog)s --resource r0 --all
|
%(prog)s --resource r0 --all
|
||||||
|
|
||||||
# Check only connection and replication status
|
# Check only connection and replication status
|
||||||
%(prog)s --check-connection --check-replication
|
%(prog)s --check-connection --check-replication
|
||||||
|
|
||||||
# Check with performance data and verbose output
|
# Full check with performance data and verbose output
|
||||||
%(prog)s --all --performance-data --verbose
|
%(prog)s --all --check-linstor --check-linstor-errors --performance-data --verbose
|
||||||
|
|
||||||
# Check LINSTOR status as well
|
# Ignore transient issues
|
||||||
%(prog)s --all --check-linstor
|
%(prog)s --all --check-linstor --ignore-transient-congestion
|
||||||
|
|
||||||
# Ignore transient congestion warnings
|
|
||||||
%(prog)s --all --ignore-transient-congestion
|
|
||||||
"""
|
"""
|
||||||
)
|
)
|
||||||
|
|
||||||
# Resource selection
|
# Resource selection
|
||||||
parser.add_argument('-r', '--resource',
|
parser.add_argument('-r', '--resource',
|
||||||
help='DRBD resource name to check (default: all)')
|
help='DRBD resource name to check (default: all)')
|
||||||
|
|
||||||
# Check options
|
# Check options - DRBD
|
||||||
parser.add_argument('--all', action='store_true',
|
parser.add_argument('--all', action='store_true',
|
||||||
help='Enable all checks (recommended)')
|
help='Enable all DRBD checks (recommended)')
|
||||||
|
|
||||||
parser.add_argument('--check-role', action='store_true',
|
parser.add_argument('--check-role', action='store_true',
|
||||||
help='Check resource role (Primary/Secondary)')
|
help='Check resource role (Primary/Secondary)')
|
||||||
|
|
||||||
parser.add_argument('--check-disk', action='store_true',
|
parser.add_argument('--check-disk', action='store_true',
|
||||||
help='Check disk state (UpToDate/Inconsistent/etc)')
|
help='Check disk state (UpToDate/Inconsistent/etc)')
|
||||||
|
|
||||||
parser.add_argument('--check-connection', action='store_true',
|
parser.add_argument('--check-connection', action='store_true',
|
||||||
help='Check connection state between nodes')
|
help='Check connection state between nodes')
|
||||||
|
|
||||||
parser.add_argument('--check-replication', action='store_true',
|
parser.add_argument('--check-replication', action='store_true',
|
||||||
help='Check replication state (Established/SyncSource/etc)')
|
help='Check replication state (Established/SyncSource/etc)')
|
||||||
|
|
||||||
parser.add_argument('--check-peer-disk', action='store_true',
|
parser.add_argument('--check-peer-disk', action='store_true',
|
||||||
help='Check peer disk state')
|
help='Check peer disk state')
|
||||||
|
|
||||||
parser.add_argument('--check-suspended', action='store_true',
|
parser.add_argument('--check-suspended', action='store_true',
|
||||||
help='Check if resource is suspended')
|
help='Check if resource is suspended')
|
||||||
|
|
||||||
parser.add_argument('--check-promotion', action='store_true',
|
parser.add_argument('--check-promotion', action='store_true',
|
||||||
help='Check if resource may be promoted (disabled by default for diskless clients)')
|
help='Check if resource may be promoted')
|
||||||
|
|
||||||
parser.add_argument('--require-promotion-capability', action='store_true',
|
parser.add_argument('--require-promotion-capability', action='store_true',
|
||||||
help='Warn if Secondary resources cannot be promoted (usually OK for diskless clients)')
|
help='Warn if Secondary resources cannot be promoted')
|
||||||
|
|
||||||
parser.add_argument('--check-quorum', action='store_true',
|
parser.add_argument('--check-quorum', action='store_true',
|
||||||
help='Check quorum status')
|
help='Check quorum status')
|
||||||
|
|
||||||
parser.add_argument('--check-congestion', action='store_true',
|
parser.add_argument('--check-congestion', action='store_true',
|
||||||
help='Check network congestion')
|
help='Check network congestion')
|
||||||
|
|
||||||
parser.add_argument('--ignore-transient-congestion', action='store_true',
|
parser.add_argument('--ignore-transient-congestion', action='store_true',
|
||||||
help='Ignore transient congestion warnings (recommended for busy networks)')
|
help='Ignore transient congestion warnings')
|
||||||
|
|
||||||
parser.add_argument('--check-client', action='store_true',
|
parser.add_argument('--check-client', action='store_true',
|
||||||
help='Check if running in client mode')
|
help='Check if running in client mode')
|
||||||
|
|
||||||
parser.add_argument('--check-resync-suspended', action='store_true',
|
parser.add_argument('--check-resync-suspended', action='store_true',
|
||||||
help='Check if resync is suspended')
|
help='Check if resync is suspended')
|
||||||
|
|
||||||
|
# LINSTOR options
|
||||||
parser.add_argument('--check-linstor', action='store_true',
|
parser.add_argument('--check-linstor', action='store_true',
|
||||||
help='Check LINSTOR status (requires linstor command)')
|
help='Check LINSTOR status (nodes, storage pools, resources)')
|
||||||
|
|
||||||
|
parser.add_argument('--check-linstor-errors', action='store_true',
|
||||||
|
help='Check LINSTOR error reports')
|
||||||
|
|
||||||
|
parser.add_argument('--linstor-error-warning', type=int, default=10,
|
||||||
|
help='Warning threshold for error reports (default: 10)')
|
||||||
|
|
||||||
|
parser.add_argument('--linstor-error-critical', type=int, default=50,
|
||||||
|
help='Critical threshold for error reports (default: 50)')
|
||||||
|
|
||||||
# Behavior options
|
# Behavior options
|
||||||
parser.add_argument('--require-primary', action='store_true',
|
parser.add_argument('--require-primary', action='store_true',
|
||||||
help='Warn if resource is not Primary')
|
help='Warn if resource is not Primary')
|
||||||
|
|
||||||
parser.add_argument('--warn-on-sync', action='store_true',
|
parser.add_argument('--warn-on-sync', action='store_true',
|
||||||
help='Warn when synchronization is in progress (default: OK)')
|
help='Warn when synchronization is in progress (default: OK)')
|
||||||
|
|
||||||
parser.add_argument('--performance-data', action='store_true',
|
parser.add_argument('--performance-data', action='store_true',
|
||||||
help='Include performance data for graphing')
|
help='Include performance data for graphing')
|
||||||
|
|
||||||
parser.add_argument('-v', '--verbose', action='store_true',
|
parser.add_argument('-v', '--verbose', action='store_true',
|
||||||
help='Verbose output (show all status messages)')
|
help='Verbose output (show all status messages)')
|
||||||
|
|
||||||
parser.add_argument('--version', action='version', version='%(prog)s 1.2')
|
parser.add_argument('--version', action='version', version='%(prog)s 2.0')
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# If --all is specified, enable all checks EXCEPT require-promotion-capability
|
# If --all is specified, enable all DRBD checks
|
||||||
if args.all:
|
if args.all:
|
||||||
args.check_role = True
|
args.check_role = True
|
||||||
args.check_disk = True
|
args.check_disk = True
|
||||||
@@ -517,24 +708,22 @@ Examples:
|
|||||||
args.check_replication = True
|
args.check_replication = True
|
||||||
args.check_peer_disk = True
|
args.check_peer_disk = True
|
||||||
args.check_suspended = True
|
args.check_suspended = True
|
||||||
# Don't enable check_promotion by default - diskless clients can't promote
|
|
||||||
# args.check_promotion = True
|
|
||||||
args.check_quorum = True
|
args.check_quorum = True
|
||||||
args.check_congestion = True
|
args.check_congestion = True
|
||||||
args.check_client = True
|
args.check_client = True
|
||||||
args.check_resync_suspended = True
|
args.check_resync_suspended = True
|
||||||
args.performance_data = True
|
args.performance_data = True
|
||||||
|
|
||||||
# If no checks specified, enable basic checks
|
# If no checks specified, enable basic checks
|
||||||
if not any([args.check_role, args.check_disk, args.check_connection,
|
if not any([args.check_role, args.check_disk, args.check_connection,
|
||||||
args.check_replication, args.check_peer_disk, args.check_suspended,
|
args.check_replication, args.check_peer_disk, args.check_suspended,
|
||||||
args.check_promotion, args.check_quorum, args.check_congestion,
|
args.check_promotion, args.check_quorum, args.check_congestion,
|
||||||
args.check_client, args.check_resync_suspended]):
|
args.check_client, args.check_resync_suspended, args.check_linstor]):
|
||||||
args.check_role = True
|
args.check_role = True
|
||||||
args.check_disk = True
|
args.check_disk = True
|
||||||
args.check_connection = True
|
args.check_connection = True
|
||||||
args.check_replication = True
|
args.check_replication = True
|
||||||
|
|
||||||
# Run checks
|
# Run checks
|
||||||
monitor = DRBDMonitor()
|
monitor = DRBDMonitor()
|
||||||
try:
|
try:
|
||||||
|
|||||||
Reference in New Issue
Block a user