Update check_drdb_linstor.py
This commit is contained in:
@@ -1,8 +1,9 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Nagios plugin for comprehensive DRBD/LINSTOR monitoring
|
||||
Author: @linuxiarz.pl Mateusz Gruszczyński
|
||||
Author: Custom Nagios DRBD Check
|
||||
License: GPL v3
|
||||
Version: 2.0 - Enhanced with full Linstor monitoring
|
||||
"""
|
||||
|
||||
import argparse
|
||||
@@ -25,7 +26,7 @@ class DRBDMonitor:
|
||||
self.warnings = []
|
||||
self.criticals = []
|
||||
self.ok_messages = []
|
||||
|
||||
|
||||
def execute_command(self, cmd: List[str]) -> Tuple[int, str, str]:
|
||||
"""Execute system command and return returncode, stdout, stderr"""
|
||||
try:
|
||||
@@ -41,7 +42,21 @@ class DRBDMonitor:
|
||||
return 124, "", "Command timeout"
|
||||
except Exception as e:
|
||||
return 1, "", str(e)
|
||||
|
||||
|
||||
def parse_linstor_json(self, stdout: str) -> List:
    """Parse LINSTOR machine-readable JSON output.

    The ``linstor -m ... list`` commands wrap their payload in a nested
    list (``[[...]]``); unwrap one level when that shape is present.

    Args:
        stdout: Raw JSON text captured from a linstor command.

    Returns:
        The inner list of objects, or an empty list when the text is not
        valid JSON or does not have a list shape.
    """
    try:
        data = json.loads(stdout)
    except (json.JSONDecodeError, TypeError, ValueError):
        # Malformed/empty output is treated as "no data" so the plugin can
        # still report on the DRBD side.  (Was a bare `except:`, which also
        # swallowed KeyboardInterrupt/SystemExit.)
        return []

    # Linstor returns [[...]] structure - unwrap the outer list.
    if isinstance(data, list) and data:
        if isinstance(data[0], list) and data[0]:
            return data[0]  # Return inner list
        return data
    return []
|
||||
|
||||
def parse_events2_output(self, output: str) -> Dict:
|
||||
"""Parse drbdsetup events2 --now --statistics output"""
|
||||
data = {
|
||||
@@ -50,25 +65,25 @@ class DRBDMonitor:
|
||||
'devices': {},
|
||||
'peer_devices': {}
|
||||
}
|
||||
|
||||
|
||||
for line in output.strip().split('\n'):
|
||||
if not line or line.startswith('#'):
|
||||
continue
|
||||
|
||||
|
||||
parts = line.split()
|
||||
if len(parts) < 2:
|
||||
continue
|
||||
|
||||
|
||||
event_type = parts[0] # exists, create, change, destroy
|
||||
object_type = parts[1] # resource, connection, device, peer-device
|
||||
|
||||
|
||||
# Parse key:value pairs
|
||||
props = {}
|
||||
for part in parts[2:]:
|
||||
if ':' in part:
|
||||
key, value = part.split(':', 1)
|
||||
props[key] = value
|
||||
|
||||
|
||||
# Store data by object type
|
||||
if object_type == 'resource':
|
||||
res_name = props.get('name', 'unknown')
|
||||
@@ -86,16 +101,16 @@ class DRBDMonitor:
|
||||
volume = props.get('volume', '0')
|
||||
key = f"{peer_name}:{volume}"
|
||||
data['peer_devices'][key] = props
|
||||
|
||||
|
||||
return data
|
||||
|
||||
|
||||
def check_resource_status(self, resource_data: Dict, args):
|
||||
"""Check resource role and status"""
|
||||
for res_name, props in resource_data.items():
|
||||
role = props.get('role', 'Unknown')
|
||||
suspended = props.get('suspended', 'no')
|
||||
may_promote = props.get('may_promote', 'no')
|
||||
|
||||
|
||||
# Check role
|
||||
if args.check_role:
|
||||
if role not in ['Primary', 'Secondary']:
|
||||
@@ -106,18 +121,16 @@ class DRBDMonitor:
|
||||
self.warnings.append(f"Resource {res_name}: Role is Secondary, expected Primary")
|
||||
else:
|
||||
self.ok_messages.append(f"Resource {res_name}: Role={role}")
|
||||
|
||||
|
||||
# Check suspended state
|
||||
if args.check_suspended and suspended == 'yes':
|
||||
self.criticals.append(f"Resource {res_name}: SUSPENDED")
|
||||
|
||||
|
||||
# Check promotion capability - ONLY if resource is Secondary and explicitly required
|
||||
# For diskless clients this is normal behavior
|
||||
if args.check_promotion and may_promote == 'no' and role == 'Secondary':
|
||||
# Don't warn for diskless resources (clients), only if explicitly checking
|
||||
if args.require_promotion_capability:
|
||||
self.warnings.append(f"Resource {res_name}: Cannot be promoted")
|
||||
|
||||
|
||||
def check_connection_status(self, connection_data: Dict, args):
|
||||
"""Check connection state between peers"""
|
||||
for conn_name, props in connection_data.items():
|
||||
@@ -125,11 +138,11 @@ class DRBDMonitor:
|
||||
role = props.get('role', 'Unknown')
|
||||
congested = props.get('congested', 'no')
|
||||
peer_node_id = props.get('peer-node-id', 'unknown')
|
||||
|
||||
|
||||
# Check connection state
|
||||
if args.check_connection:
|
||||
if connection not in ['Connected', 'StandAlone']:
|
||||
if connection in ['Connecting', 'Timeout', 'BrokenPipe', 'NetworkFailure',
|
||||
if connection in ['Connecting', 'Timeout', 'BrokenPipe', 'NetworkFailure',
|
||||
'ProtocolError', 'TearDown', 'Unconnected', 'Disconnecting']:
|
||||
self.criticals.append(f"Connection {conn_name}: State={connection}")
|
||||
elif connection in ['WFConnection', 'WFReportParams']:
|
||||
@@ -138,13 +151,12 @@ class DRBDMonitor:
|
||||
self.criticals.append(f"Connection {conn_name}: Unknown state {connection}")
|
||||
else:
|
||||
self.ok_messages.append(f"Connection {conn_name} to peer-{peer_node_id}: {connection}")
|
||||
|
||||
# Check congestion - only warn if it's persistent or critical
|
||||
|
||||
# Check congestion
|
||||
if args.check_congestion and congested == 'yes':
|
||||
# Congestion can be temporary, so only warn instead of critical
|
||||
if not args.ignore_transient_congestion:
|
||||
self.warnings.append(f"Connection {conn_name}: CONGESTED")
|
||||
|
||||
|
||||
def check_device_status(self, device_data: Dict, args):
|
||||
"""Check device/volume disk state"""
|
||||
for dev_key, props in device_data.items():
|
||||
@@ -154,7 +166,7 @@ class DRBDMonitor:
|
||||
minor = props.get('minor', 'unknown')
|
||||
client = props.get('client', 'no')
|
||||
quorum = props.get('quorum', 'yes')
|
||||
|
||||
|
||||
# Check disk state
|
||||
if args.check_disk:
|
||||
if disk not in ['UpToDate', 'Diskless']:
|
||||
@@ -168,15 +180,15 @@ class DRBDMonitor:
|
||||
self.warnings.append(f"Device {dev_name} vol:{volume}: Disk={disk}")
|
||||
else:
|
||||
self.ok_messages.append(f"Device {dev_name} vol:{volume} (minor:{minor}): Disk={disk}")
|
||||
|
||||
|
||||
# Check quorum - but not for diskless clients
|
||||
if args.check_quorum and quorum == 'no' and disk != 'Diskless':
|
||||
self.criticals.append(f"Device {dev_name} vol:{volume}: NO QUORUM")
|
||||
|
||||
|
||||
# Check client mode
|
||||
if args.check_client and client == 'yes':
|
||||
self.ok_messages.append(f"Device {dev_name} vol:{volume}: Running in client mode")
|
||||
|
||||
|
||||
def check_peer_device_status(self, peer_device_data: Dict, args):
|
||||
"""Check peer device replication state"""
|
||||
for peer_key, props in peer_device_data.items():
|
||||
@@ -187,12 +199,11 @@ class DRBDMonitor:
|
||||
resync_suspended = props.get('resync-suspended', 'no')
|
||||
peer_client = props.get('peer-client', 'no')
|
||||
peer_node_id = props.get('peer-node-id', 'unknown')
|
||||
|
||||
|
||||
# Check replication state
|
||||
if args.check_replication:
|
||||
if replication not in ['Established', 'Off']:
|
||||
if replication in ['SyncSource', 'SyncTarget']:
|
||||
# Synchronization in progress - warning or OK depending on config
|
||||
if args.warn_on_sync:
|
||||
self.warnings.append(f"Peer {peer_name} vol:{volume}: Syncing ({replication})")
|
||||
else:
|
||||
@@ -209,7 +220,7 @@ class DRBDMonitor:
|
||||
self.criticals.append(f"Peer {peer_name} vol:{volume}: Replication={replication}")
|
||||
else:
|
||||
self.ok_messages.append(f"Peer {peer_name} (node-{peer_node_id}) vol:{volume}: {replication}")
|
||||
|
||||
|
||||
# Check peer disk state
|
||||
if args.check_peer_disk:
|
||||
if peer_disk not in ['UpToDate', 'Diskless', 'DUnknown']:
|
||||
@@ -217,175 +228,348 @@ class DRBDMonitor:
|
||||
self.criticals.append(f"Peer {peer_name} vol:{volume}: Peer-Disk={peer_disk}")
|
||||
else:
|
||||
self.warnings.append(f"Peer {peer_name} vol:{volume}: Peer-Disk={peer_disk}")
|
||||
|
||||
|
||||
# Check resync suspended
|
||||
if args.check_resync_suspended and resync_suspended == 'yes':
|
||||
self.warnings.append(f"Peer {peer_name} vol:{volume}: Resync SUSPENDED")
|
||||
|
||||
def get_statistics(self, resource: str = 'all') -> Dict:
    """Collect key/value statistics from ``drbdsetup events2 --statistics``.

    Args:
        resource: DRBD resource name, or 'all' for every resource.

    Returns:
        A flat mapping of statistic name to value (int where the value
        parses as an integer, raw string otherwise); later lines overwrite
        earlier keys.  Empty dict when the command fails.
    """
    rc, stdout, _ = self.execute_command(
        ['drbdsetup', 'events2', '--now', '--statistics', resource])
    if rc != 0:
        return {}

    stats = {}
    for raw_line in stdout.strip().split('\n'):
        tokens = raw_line.split()
        if len(tokens) < 2:
            continue

        # Everything after "<event> <object>" is a key:value statistic pair.
        for token in tokens[2:]:
            if ':' not in token:
                continue
            name, _, text = token.partition(':')
            try:
                stats[name] = int(text)
            except ValueError:
                stats[name] = text
    return stats
|
||||
|
||||
def add_performance_data(self, device_data: Dict, peer_device_data: Dict):
    """Append Nagios performance-data samples for devices and peer devices.

    Args:
        device_data: Device properties keyed by device identifier.
        peer_device_data: Peer-device properties keyed by "peer:volume".
    """
    # Total number of tracked devices (one entry per device/resource key).
    self.perfdata.append(f"resources={len(device_data)}")

    # Devices whose local disk is fully synchronized.
    healthy = [d for d in device_data.values() if d.get('disk') == 'UpToDate']
    self.perfdata.append(f"uptodate_devices={len(healthy)}")

    # Replication-link breakdown: fully established vs. actively syncing.
    repl_states = [p.get('replication') for p in peer_device_data.values()]
    self.perfdata.append(
        f"established_replications={repl_states.count('Established')}")
    in_sync = sum(1 for s in repl_states if s in ('SyncSource', 'SyncTarget'))
    self.perfdata.append(f"syncing_replications={in_sync}")
|
||||
|
||||
def check_linstor_status(self, args):
|
||||
"""Check LINSTOR specific status if available"""
|
||||
|
||||
def check_linstor_nodes(self, args):
|
||||
"""Check Linstor node status and communication"""
|
||||
if not args.check_linstor:
|
||||
return
|
||||
|
||||
# Check if linstor command is available
|
||||
cmd = ['which', 'linstor']
|
||||
rc, _, _ = self.execute_command(cmd)
|
||||
if rc != 0:
|
||||
if args.verbose:
|
||||
self.ok_messages.append("LINSTOR: Command not available (optional)")
|
||||
return
|
||||
|
||||
# Get resource list - try different output formats
|
||||
cmd = ['linstor', '--machine-readable', 'resource', 'list']
|
||||
|
||||
cmd = ['linstor', '-m', 'node', 'list']
|
||||
rc, stdout, stderr = self.execute_command(cmd)
|
||||
|
||||
|
||||
if rc != 0:
|
||||
if args.verbose:
|
||||
self.warnings.append(f"LINSTOR: Failed to get resource list: {stderr}")
|
||||
self.warnings.append(f"LINSTOR: Cannot get node list")
|
||||
return
|
||||
|
||||
|
||||
try:
|
||||
# Parse JSON output
|
||||
linstor_output = json.loads(stdout)
|
||||
|
||||
# LINSTOR returns an array, first check if it's valid
|
||||
if not isinstance(linstor_output, list) or len(linstor_output) == 0:
|
||||
if args.verbose:
|
||||
self.ok_messages.append("LINSTOR: No data returned")
|
||||
return
|
||||
|
||||
# Try to extract resource data from various possible formats
|
||||
linstor_resource_count = 0
|
||||
linstor_volume_count = 0
|
||||
|
||||
# Format 1: Array of response objects with 'resources' key
|
||||
for item in linstor_output:
|
||||
if isinstance(item, dict):
|
||||
# Try 'resources' key
|
||||
resources = item.get('resources', [])
|
||||
if resources and isinstance(resources, list):
|
||||
for res in resources:
|
||||
if isinstance(res, dict):
|
||||
linstor_resource_count += 1
|
||||
volumes = res.get('vlms', []) or res.get('volumes', [])
|
||||
if isinstance(volumes, list):
|
||||
linstor_volume_count += len(volumes)
|
||||
|
||||
# Format 2: Direct array of resources (older format)
|
||||
if linstor_resource_count == 0:
|
||||
for item in linstor_output:
|
||||
if isinstance(item, dict) and 'name' in item:
|
||||
linstor_resource_count += 1
|
||||
volumes = item.get('vlms', []) or item.get('volumes', [])
|
||||
if isinstance(volumes, list):
|
||||
linstor_volume_count += len(volumes)
|
||||
|
||||
if linstor_resource_count > 0:
|
||||
self.ok_messages.append(
|
||||
f"LINSTOR: {linstor_resource_count} resources, "
|
||||
f"{linstor_volume_count} volumes"
|
||||
)
|
||||
|
||||
# Add performance data
|
||||
if args.performance_data:
|
||||
self.perfdata.append(f"linstor_resources={linstor_resource_count}")
|
||||
self.perfdata.append(f"linstor_volumes={linstor_volume_count}")
|
||||
else:
|
||||
if args.verbose:
|
||||
self.ok_messages.append("LINSTOR: No resources found")
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
if args.verbose:
|
||||
self.warnings.append(f"LINSTOR: JSON parse error: {str(e)[:50]}")
|
||||
nodes = self.parse_linstor_json(stdout)
|
||||
|
||||
online_nodes = 0
|
||||
offline_nodes = []
|
||||
evicted_nodes = []
|
||||
|
||||
for node in nodes:
|
||||
if not isinstance(node, dict):
|
||||
continue
|
||||
|
||||
node_name = node.get('name', 'unknown')
|
||||
connection_status = node.get('connection_status', 'UNKNOWN')
|
||||
node_type = node.get('type', 'UNKNOWN')
|
||||
flags = node.get('flags', [])
|
||||
|
||||
# Check for EVICTED flag
|
||||
if 'EVICTED' in flags:
|
||||
evicted_nodes.append(f"{node_name}({node_type})")
|
||||
continue
|
||||
|
||||
if connection_status == 'ONLINE':
|
||||
online_nodes += 1
|
||||
else:
|
||||
offline_nodes.append(f"{node_name}({node_type}):{connection_status}")
|
||||
|
||||
if offline_nodes:
|
||||
self.criticals.append(f"LINSTOR: Nodes offline: {', '.join(offline_nodes)}")
|
||||
|
||||
if evicted_nodes:
|
||||
self.warnings.append(f"LINSTOR: Nodes evicted: {', '.join(evicted_nodes)}")
|
||||
|
||||
if not offline_nodes and not evicted_nodes and online_nodes > 0:
|
||||
self.ok_messages.append(f"LINSTOR: All {online_nodes} nodes online")
|
||||
|
||||
if args.performance_data:
|
||||
self.perfdata.append(f"linstor_nodes_online={online_nodes}")
|
||||
self.perfdata.append(f"linstor_nodes_offline={len(offline_nodes)}")
|
||||
|
||||
except Exception as e:
|
||||
if args.verbose:
|
||||
self.warnings.append(f"LINSTOR: Processing error: {str(e)[:50]}")
|
||||
|
||||
self.warnings.append(f"LINSTOR: Error checking nodes: {str(e)[:50]}")
|
||||
|
||||
def check_linstor_storage_pools(self, args):
    """Check LINSTOR storage pool health, API error reports and capacity.

    Flags pools whose reports contain known error signatures, warns at
    >=90% usage and goes critical at >=95%.  DISKLESS pools are skipped
    for capacity accounting.  Optionally emits aggregate capacity
    performance data.

    Args:
        args: Parsed CLI namespace; reads check_linstor, verbose and
            performance_data.
    """
    if not args.check_linstor:
        return

    cmd = ['linstor', '-m', 'storage-pool', 'list']
    rc, stdout, stderr = self.execute_command(cmd)

    if rc != 0:
        # Dropped the pointless f-prefix on this constant string.
        if args.verbose:
            self.warnings.append("LINSTOR: Cannot get storage pool list")
        return

    try:
        storage_pools = self.parse_linstor_json(stdout)

        pool_errors = []
        pool_warnings = []
        total_capacity = 0
        total_free = 0
        pools_checked = 0

        for pool in storage_pools:
            if not isinstance(pool, dict):
                continue

            node_name = pool.get('node_name', 'unknown')
            pool_name = pool.get('stor_pool_name', 'unknown')
            provider_kind = pool.get('provider_kind', 'unknown')

            # Scan attached API reports for known error patterns.
            for report in pool.get('reports', []):
                if not isinstance(report, dict):
                    continue

                message = report.get('message', '')
                ret_code = report.get('ret_code', 0)

                if 'ApiRcException' in message or 'RuntimeException' in message:
                    pool_errors.append(f"{node_name}:{pool_name} - ApiRcException")
                elif 'Failed to query free space' in message:
                    pool_errors.append(f"{node_name}:{pool_name} - Cannot query free space")
                elif 'query free space from storage pool' in message.lower():
                    pool_errors.append(f"{node_name}:{pool_name} - Storage pool query failed")
                elif ret_code not in [0, None] and message:
                    pool_warnings.append(f"{node_name}:{pool_name} - {message[:60]}")

            # DISKLESS pools have no backing storage to measure.
            if provider_kind == 'DISKLESS':
                continue

            free_capacity = pool.get('free_capacity', None)
            total_capacity_pool = pool.get('total_capacity', None)

            if free_capacity is not None and total_capacity_pool is not None:
                pools_checked += 1
                total_capacity += total_capacity_pool
                total_free += free_capacity

                if total_capacity_pool > 0:
                    usage_percent = ((total_capacity_pool - free_capacity) / total_capacity_pool) * 100

                    if usage_percent >= 95:
                        pool_errors.append(f"{node_name}:{pool_name} - {usage_percent:.1f}% full")
                    elif usage_percent >= 90:
                        pool_warnings.append(f"{node_name}:{pool_name} - {usage_percent:.1f}% full")
            else:
                # Diskful pool without readable capacity: potential backend issue.
                # (Removed a redundant re-test of provider_kind != 'DISKLESS' here;
                # it was always true after the `continue` above.)
                pool_warnings.append(f"{node_name}:{pool_name} - Cannot read capacity")

        # Cap output at three entries per severity to keep the status line short.
        if pool_errors:
            self.criticals.append(f"LINSTOR: Storage pool errors: {'; '.join(pool_errors[:3])}")

        if pool_warnings:
            self.warnings.append(f"LINSTOR: Storage pool warnings: {'; '.join(pool_warnings[:3])}")

        if not pool_errors and not pool_warnings and pools_checked > 0:
            self.ok_messages.append(f"LINSTOR: {pools_checked} storage pools OK")

        # Aggregate capacity perfdata.  The // 1024 "MB" conversion assumes
        # LINSTOR reports capacities in KiB - TODO confirm against the API.
        if total_capacity > 0 and args.performance_data:
            usage_percent = ((total_capacity - total_free) / total_capacity) * 100
            self.perfdata.append(f"linstor_storage_usage={usage_percent:.2f}%")
            self.perfdata.append(f"linstor_storage_free_mb={total_free // 1024}")
            self.perfdata.append(f"linstor_storage_total_mb={total_capacity // 1024}")

    except Exception as e:
        # Best-effort: a parsing problem must not crash the whole plugin.
        if args.verbose:
            self.warnings.append(f"LINSTOR: Error checking storage pools: {str(e)[:50]}")
|
||||
|
||||
def check_linstor_resources(self, args):
    """Check LINSTOR resources for sync progress and volume error states.

    Walks every resource's volumes, inspects the DRBD layer data and
    classifies each volume as up-to-date, syncing or errored; reports
    warnings/criticals and optional performance data accordingly.
    """
    if not args.check_linstor:
        return

    rc, stdout, stderr = self.execute_command(['linstor', '-m', 'resource', 'list'])

    if rc != 0:
        if args.verbose:
            self.warnings.append(f"LINSTOR: Cannot get resource list")
        return

    # Replication states meaning an active or paused resynchronization.
    sync_states = ['SyncSource', 'SyncTarget', 'PausedSyncS', 'PausedSyncT']
    # Replication states meaning a broken peer link.
    broken_states = ['StandAlone', 'Disconnecting', 'NetworkFailure']

    try:
        syncing = []
        errored = []
        volume_total = 0
        volume_uptodate = 0

        for res in self.parse_linstor_json(stdout):
            if not isinstance(res, dict):
                continue

            host = res.get('node_name', 'unknown')
            res_name = res.get('name', 'unknown')

            for vol in res.get('vlms', []):
                if not isinstance(vol, dict):
                    continue

                volume_total += 1
                vol_nr = vol.get('vlm_nr', 0)

                if 'layer_data_list' not in vol:
                    continue

                # Only the DRBD layer carries disk/replication state.
                for layer in vol['layer_data_list']:
                    if not isinstance(layer, dict) or layer.get('type') != 'DRBD':
                        continue

                    drbd = layer.get('data', {})
                    disk_state = drbd.get('disk_state', '')
                    repl_state = drbd.get('repl_state', '')

                    if disk_state == 'UpToDate':
                        volume_uptodate += 1

                    label = f"{res_name}@{host}:{vol_nr}"

                    if repl_state in sync_states:
                        syncing.append(f"{label} ({repl_state})")

                    if disk_state in ['Failed', 'Inconsistent']:
                        errored.append(f"{label} ({disk_state})")

                    if repl_state in broken_states:
                        errored.append(f"{label} ({repl_state})")

        # Report sync progress; truncate long lists to three entries.
        if syncing:
            if len(syncing) <= 3:
                self.warnings.append(f"LINSTOR: Resources syncing: {', '.join(syncing)}")
            else:
                self.warnings.append(f"LINSTOR: {len(syncing)} resources syncing: {', '.join(syncing[:3])}...")

        if errored:
            self.criticals.append(f"LINSTOR: Resources in error state: {', '.join(errored)}")

        if not syncing and not errored and volume_total > 0:
            self.ok_messages.append(f"LINSTOR: {volume_uptodate}/{volume_total} volumes UpToDate")

        # Performance data
        if args.performance_data:
            self.perfdata.append(f"linstor_volumes_total={volume_total}")
            self.perfdata.append(f"linstor_volumes_syncing={len(syncing)}")
            self.perfdata.append(f"linstor_volumes_errors={len(errored)}")
            self.perfdata.append(f"linstor_volumes_uptodate={volume_uptodate}")

    except Exception as e:
        if args.verbose:
            self.warnings.append(f"LINSTOR: Error checking resources: {str(e)[:50]}")
|
||||
|
||||
def check_linstor_error_reports(self, args):
    """Check the number of LINSTOR error reports against thresholds.

    Goes CRITICAL above args.linstor_error_critical and WARNING above
    args.linstor_error_warning; best-effort, so command or parse
    failures are silently ignored.

    Args:
        args: Parsed CLI namespace; reads check_linstor,
            check_linstor_errors, the two thresholds, verbose and
            performance_data.
    """
    if not args.check_linstor or not args.check_linstor_errors:
        return

    cmd = ['linstor', '-m', 'error-reports', 'list']
    rc, stdout, stderr = self.execute_command(cmd)

    if rc != 0:
        # Error-report listing is optional; skip quietly on failure.
        return

    try:
        error_reports = self.parse_linstor_json(stdout)

        if error_reports:
            report_count = len(error_reports)

            # Apply warning/critical thresholds.
            if report_count > args.linstor_error_critical:
                self.criticals.append(f"LINSTOR: {report_count} error reports in system")
            elif report_count > args.linstor_error_warning:
                self.warnings.append(f"LINSTOR: {report_count} error reports in system")
            elif args.verbose:
                self.ok_messages.append(f"LINSTOR: {report_count} error reports (below threshold)")

            if args.performance_data:
                self.perfdata.append(f"linstor_error_reports={report_count}")

    except Exception:
        # Best-effort: never fail the plugin on a parsing problem, but do
        # not swallow KeyboardInterrupt/SystemExit (the old bare `except:` did).
        pass
|
||||
|
||||
def add_performance_data(self, device_data: Dict, peer_device_data: Dict, args):
    """Append DRBD performance-data samples for Nagios graphing.

    No-op unless args.performance_data is set.

    Args:
        device_data: Device properties keyed by device identifier.
        peer_device_data: Peer-device properties keyed by "peer:volume".
        args: Parsed CLI namespace; reads performance_data.
    """
    if not args.performance_data:
        return

    # Total tracked DRBD resources (one entry per device key).
    self.perfdata.append(f"drbd_resources={len(device_data)}")

    # Locally synchronized devices.
    current = [d for d in device_data.values() if d.get('disk') == 'UpToDate']
    self.perfdata.append(f"drbd_uptodate_devices={len(current)}")

    # Replication-link breakdown: fully established vs. mid-sync.
    states = [p.get('replication') for p in peer_device_data.values()]
    self.perfdata.append(
        f"drbd_established_replications={states.count('Established')}")
    mid_sync = sum(1 for s in states if s in ('SyncSource', 'SyncTarget'))
    self.perfdata.append(f"drbd_syncing_replications={mid_sync}")
|
||||
|
||||
def run_checks(self, args):
|
||||
"""Main check execution"""
|
||||
# Get DRBD events2 output
|
||||
resource_filter = args.resource if args.resource else 'all'
|
||||
cmd = ['drbdsetup', 'events2', '--now', '--statistics', resource_filter]
|
||||
|
||||
|
||||
rc, stdout, stderr = self.execute_command(cmd)
|
||||
|
||||
|
||||
if rc != 0:
|
||||
print(f"CRITICAL - Failed to execute drbdsetup: {stderr}")
|
||||
sys.exit(STATE_CRITICAL)
|
||||
|
||||
|
||||
if not stdout.strip():
|
||||
print("CRITICAL - No DRBD resources found")
|
||||
sys.exit(STATE_CRITICAL)
|
||||
|
||||
|
||||
# Parse output
|
||||
data = self.parse_events2_output(stdout)
|
||||
|
||||
# Run all checks
|
||||
|
||||
# Run DRBD checks
|
||||
if data['resources']:
|
||||
self.check_resource_status(data['resources'], args)
|
||||
|
||||
|
||||
if data['connections']:
|
||||
self.check_connection_status(data['connections'], args)
|
||||
|
||||
|
||||
if data['devices']:
|
||||
self.check_device_status(data['devices'], args)
|
||||
|
||||
|
||||
if data['peer_devices']:
|
||||
self.check_peer_device_status(data['peer_devices'], args)
|
||||
|
||||
# Add performance data
|
||||
if args.performance_data:
|
||||
self.add_performance_data(data['devices'], data['peer_devices'])
|
||||
|
||||
# Check LINSTOR if requested
|
||||
self.check_linstor_status(args)
|
||||
|
||||
|
||||
# Add DRBD performance data
|
||||
self.add_performance_data(data['devices'], data['peer_devices'], args)
|
||||
|
||||
# Run LINSTOR checks if enabled
|
||||
if args.check_linstor:
|
||||
self.check_linstor_nodes(args)
|
||||
self.check_linstor_storage_pools(args)
|
||||
self.check_linstor_resources(args)
|
||||
self.check_linstor_error_reports(args)
|
||||
|
||||
# Determine final status
|
||||
return self.get_final_status(args)
|
||||
|
||||
|
||||
def get_final_status(self, args) -> int:
|
||||
"""Determine final Nagios status and output"""
|
||||
if self.criticals:
|
||||
@@ -404,14 +588,14 @@ class DRBDMonitor:
|
||||
else:
|
||||
status = STATE_OK
|
||||
status_text = "OK"
|
||||
messages = self.ok_messages if args.verbose else ["All DRBD checks passed"]
|
||||
|
||||
messages = self.ok_messages if args.verbose else ["All DRBD/LINSTOR checks passed"]
|
||||
|
||||
# Build output
|
||||
output = f"{status_text} - {'; '.join(messages)}"
|
||||
|
||||
|
||||
if self.perfdata:
|
||||
output += " | " + " ".join(self.perfdata)
|
||||
|
||||
|
||||
print(output)
|
||||
return status
|
||||
|
||||
@@ -422,94 +606,101 @@ def main():
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
# Check all parameters for all resources
|
||||
%(prog)s --all
|
||||
|
||||
# Check all DRBD and LINSTOR parameters
|
||||
%(prog)s --all --check-linstor
|
||||
|
||||
# Check specific resource
|
||||
%(prog)s --resource r0 --all
|
||||
|
||||
|
||||
# Check only connection and replication status
|
||||
%(prog)s --check-connection --check-replication
|
||||
|
||||
# Check with performance data and verbose output
|
||||
%(prog)s --all --performance-data --verbose
|
||||
|
||||
# Check LINSTOR status as well
|
||||
%(prog)s --all --check-linstor
|
||||
|
||||
# Ignore transient congestion warnings
|
||||
%(prog)s --all --ignore-transient-congestion
|
||||
|
||||
# Full check with performance data and verbose output
|
||||
%(prog)s --all --check-linstor --check-linstor-errors --performance-data --verbose
|
||||
|
||||
# Ignore transient issues
|
||||
%(prog)s --all --check-linstor --ignore-transient-congestion
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
# Resource selection
|
||||
parser.add_argument('-r', '--resource',
|
||||
help='DRBD resource name to check (default: all)')
|
||||
|
||||
# Check options
|
||||
|
||||
# Check options - DRBD
|
||||
parser.add_argument('--all', action='store_true',
|
||||
help='Enable all checks (recommended)')
|
||||
|
||||
help='Enable all DRBD checks (recommended)')
|
||||
|
||||
parser.add_argument('--check-role', action='store_true',
|
||||
help='Check resource role (Primary/Secondary)')
|
||||
|
||||
|
||||
parser.add_argument('--check-disk', action='store_true',
|
||||
help='Check disk state (UpToDate/Inconsistent/etc)')
|
||||
|
||||
|
||||
parser.add_argument('--check-connection', action='store_true',
|
||||
help='Check connection state between nodes')
|
||||
|
||||
|
||||
parser.add_argument('--check-replication', action='store_true',
|
||||
help='Check replication state (Established/SyncSource/etc)')
|
||||
|
||||
|
||||
parser.add_argument('--check-peer-disk', action='store_true',
|
||||
help='Check peer disk state')
|
||||
|
||||
|
||||
parser.add_argument('--check-suspended', action='store_true',
|
||||
help='Check if resource is suspended')
|
||||
|
||||
|
||||
parser.add_argument('--check-promotion', action='store_true',
|
||||
help='Check if resource may be promoted (disabled by default for diskless clients)')
|
||||
|
||||
help='Check if resource may be promoted')
|
||||
|
||||
parser.add_argument('--require-promotion-capability', action='store_true',
|
||||
help='Warn if Secondary resources cannot be promoted (usually OK for diskless clients)')
|
||||
|
||||
help='Warn if Secondary resources cannot be promoted')
|
||||
|
||||
parser.add_argument('--check-quorum', action='store_true',
|
||||
help='Check quorum status')
|
||||
|
||||
|
||||
parser.add_argument('--check-congestion', action='store_true',
|
||||
help='Check network congestion')
|
||||
|
||||
|
||||
parser.add_argument('--ignore-transient-congestion', action='store_true',
|
||||
help='Ignore transient congestion warnings (recommended for busy networks)')
|
||||
|
||||
help='Ignore transient congestion warnings')
|
||||
|
||||
parser.add_argument('--check-client', action='store_true',
|
||||
help='Check if running in client mode')
|
||||
|
||||
|
||||
parser.add_argument('--check-resync-suspended', action='store_true',
|
||||
help='Check if resync is suspended')
|
||||
|
||||
|
||||
# LINSTOR options
|
||||
parser.add_argument('--check-linstor', action='store_true',
|
||||
help='Check LINSTOR status (requires linstor command)')
|
||||
|
||||
help='Check LINSTOR status (nodes, storage pools, resources)')
|
||||
|
||||
parser.add_argument('--check-linstor-errors', action='store_true',
|
||||
help='Check LINSTOR error reports')
|
||||
|
||||
parser.add_argument('--linstor-error-warning', type=int, default=10,
|
||||
help='Warning threshold for error reports (default: 10)')
|
||||
|
||||
parser.add_argument('--linstor-error-critical', type=int, default=50,
|
||||
help='Critical threshold for error reports (default: 50)')
|
||||
|
||||
# Behavior options
|
||||
parser.add_argument('--require-primary', action='store_true',
|
||||
help='Warn if resource is not Primary')
|
||||
|
||||
|
||||
parser.add_argument('--warn-on-sync', action='store_true',
|
||||
help='Warn when synchronization is in progress (default: OK)')
|
||||
|
||||
|
||||
parser.add_argument('--performance-data', action='store_true',
|
||||
help='Include performance data for graphing')
|
||||
|
||||
|
||||
parser.add_argument('-v', '--verbose', action='store_true',
|
||||
help='Verbose output (show all status messages)')
|
||||
|
||||
parser.add_argument('--version', action='version', version='%(prog)s 1.2')
|
||||
|
||||
|
||||
parser.add_argument('--version', action='version', version='%(prog)s 2.0')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# If --all is specified, enable all checks EXCEPT require-promotion-capability
|
||||
|
||||
# If --all is specified, enable all DRBD checks
|
||||
if args.all:
|
||||
args.check_role = True
|
||||
args.check_disk = True
|
||||
@@ -517,24 +708,22 @@ Examples:
|
||||
args.check_replication = True
|
||||
args.check_peer_disk = True
|
||||
args.check_suspended = True
|
||||
# Don't enable check_promotion by default - diskless clients can't promote
|
||||
# args.check_promotion = True
|
||||
args.check_quorum = True
|
||||
args.check_congestion = True
|
||||
args.check_client = True
|
||||
args.check_resync_suspended = True
|
||||
args.performance_data = True
|
||||
|
||||
|
||||
# If no checks specified, enable basic checks
|
||||
if not any([args.check_role, args.check_disk, args.check_connection,
|
||||
args.check_replication, args.check_peer_disk, args.check_suspended,
|
||||
args.check_promotion, args.check_quorum, args.check_congestion,
|
||||
args.check_client, args.check_resync_suspended]):
|
||||
args.check_client, args.check_resync_suspended, args.check_linstor]):
|
||||
args.check_role = True
|
||||
args.check_disk = True
|
||||
args.check_connection = True
|
||||
args.check_replication = True
|
||||
|
||||
|
||||
# Run checks
|
||||
monitor = DRBDMonitor()
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user