#
# Copyright (c) 2001-2014 NetApp, Inc., All Rights Reserved
# Any use, modification, or distribution is prohibited
# without prior written consent from NetApp, Inc.
#

## @summary API for testing Health Monitor and Fabric Health Monitor
## @author yulun@netapp.com
## @status shared
## @pod here

package NACL::MTask::HealthMonitor;

=head1 NAME

NACL::MTask::HealthMonitor

=head1 DESCRIPTION

C<NACL::MTask::HealthMonitor> provides methods related to checking for and
validating Health Monitor and Fabric Health Monitor alerts.

=head1 ATTRIBUTES

=head2 command_interface

(Required, isa 'NACL::C::Node') A command interface object or an arrayref to
multiple node objects which represent the node to check for alerts.
See L<NACL::C::Node>.

=cut

use strict;
use warnings;

use Params::Validate qw(validate_with HASHREF SCALAR ARRAYREF);
use NACL::InstantComponent qw(instant_component);
use San::SanUtil;
use NATE::BaseException qw(:try);
use NATE::Log qw(log_global);

my $log       = log_global();
my $may_enter = $log->may_enter();
my $may_exit  = $log->may_exit();

# Exported Globals
$HealthMonitor::TRUE  = 1;
$HealthMonitor::FALSE = 0;

######################## Methods for task users here #######################

=head1 METHODS

=head2 new

    my $hm_obj = NACL::MTask::HealthMonitor->new (
        command_interface => \@ci   # Specify multiple nodes to track
    );

Return a new NACL::MTask::HealthMonitor object.

=cut

use Class::MethodMaker [
    new    => [ '-hash', '-init', 'new' ],
    scalar => [ { '-type' => 'ARRAY' }, 'command_interface' ]
];

# Initializer for the object (the generated new() is declared with '-init';
# presumably that is what invokes this method -- confirm against
# Class::MethodMaker).
#
# Validates that 'command_interface' is an arrayref, builds one API set per
# node (stored in $self->{apisets}, keyed by node name), and records the
# system health alerts already present on each node in
# $self->{init_shm_alert}->{<node name>}. Nodes without initial alerts get no
# entry in that hash.
sub init {
    $log->enter() if $may_enter;
    my $self = shift;

    validate_with(
        params => \@_,
        spec   => {
            command_interface => { type => ARRAYREF },
        },
    );

    # Create hash of node api sets.
    my $apisets = $self->_get_nodes_api(
        'nodes' => \@{ $self->command_interface() } );
    $self->{apisets} = $apisets;

    # Save the shm alerts that already show in the system.
    $log->comment("Saving initial shm alerts on the node.");

    # Explicit %{ } dereference: 'keys' applied directly to a hashref
    # expression is a fatal error on Perl 5.24 and later.
    foreach my $node ( keys %{ $self->{apisets} } ) {
        my @shm_alerts =
          @{ $self->_get_shm_alert( apiset => $self->{apisets}->{$node} ) };

        if ( !@shm_alerts ) {
            $log->comment("$node: No initial shm alerts found");
        }
        else {
            $self->{init_shm_alert}->{$node} = \@shm_alerts;
            $log->comment("$node: Initial shm alerts");
            foreach my $shm (@shm_alerts) {
                $log->comment("$node: [$shm->{'alert-id'}] $shm->{'probable-cause-description'}");
            }
        }
    }

    $log->exit() if $may_exit;
}

=head2 wait_for_alert

Wait for a list of alerts to show up on both clusters. Optionally specify an
amount of time to wait for the alerts to show up.

    $hm_obj->wait_for_alert(
        shm     => \@shm,
        fhm     => $fhm,
        timeout => 3000     # Must be specified in seconds
    );

=over

=item C<< shm => \@shm >>

(Optional) The list of hashes of the system health monitor alerts to wait for
on the cluster(s). If 'node' is specified, the alert will only be checked on
that specific node.

    [ {'alert' => <alert text>, 'node' => <node name> (Optional)}, ... ]

e.g. To check on a specific node

    {'alert' => 'Physical link on port 1a is offline',
     'node'  => 'dpgqa-mcc-interop-3250-03'};

or to check on all nodes

    {'alert' => 'All ISL links on Cisco_10.228.57.84 are down'};

=item C<< fhm => $fhm >>

(Optional) The hashref to the fabric health monitor alerts and status to wait
for on the cluster(s). The key should be the wwn of the switch or bridge, so
an entry can be used as $fhm->{wwn}->{type}.

    {
        '<wwn>' => {
            'type'   => <switch|bridge>,
            'alert'  => [alert1, alert2, ...],
            'status' => <expected status>
        }
    }

e.g.

    {
        '2000002a6ab9ce20' => {
            'type'   => 'switch',
            'alert'  => ['Switch is Unreachable over Management Network', ...],
            'status' => 'error'
        }
    }

For the rest of the switches/bridges that are not specified, their status is
assumed to be 'ok'.

=item C<< timeout => $timeout >>

(Optional) The amount of time to wait for an alert before failing. Default is
set to 30 minutes.

=back

=cut

# Poll every node until all expected shm/fhm alerts are visible, or until the
# timeout elapses.
#
# Parameters (all optional, validated with Params::Validate):
#   shm     - arrayref of expected system health alerts
#   fhm     - hashref of expected fabric health alerts, keyed by wwn
#   timeout - seconds to wait before giving up (default 1800)
#
# Returns $HealthMonitor::TRUE once every node reports all expected alerts.
# Throws NATE::BaseException if that has not happened when polling stops.
# Exceptions thrown by _check_fhm (unexpected switch/bridge status) propagate.
sub wait_for_alert {
    $log->enter() if $may_enter;
    my $self = shift;

    my %opts = validate_with(
        params => \@_,
        spec   => {
            shm     => { type => ARRAYREF, optional => 1 },
            fhm     => { type => HASHREF,  optional => 1 },
            timeout => { type => SCALAR,   optional => 1, default => 1800 },
        },
    );

    my $start_time = time;
    my $retry_time = 300;                 # seconds between polling passes
    my $wait_time  = $opts{'timeout'};
    my $completed  = 0;
    my $timeout    = 0;

    do {
        my $node_completed = 0;

        # Explicit %{ } dereference: 'keys' on a hashref expression is a
        # fatal error on Perl 5.24 and later.
        foreach my $node ( keys %{ $self->{apisets} } ) {
            my ( $shm_complete, $fhm_complete );

            if ( exists $opts{'shm'} ) {
                $log->comment("$node: Start to wait for shm alerts shows");
                $shm_complete = $self->_check_shm(
                    node         => $node,
                    expected_shm => \@{ $opts{'shm'} },
                    mode         => 'show',
                );
                if ($shm_complete) {
                    $log->comment("$node: All shm alerts found");
                }
                else {
                    $log->comment("$node: Not all shm alerts found yet. Retry in $retry_time secs.");
                }
            }
            else {
                $log->comment("shm to wait is not specified, skipping shm check");
                $shm_complete = 1;
            }

            if ( exists $opts{'fhm'} ) {
                $log->comment("$node: Start to wait for fhm alerts shows");
                $fhm_complete = $self->_check_fhm(
                    node         => $node,
                    expected_fhm => $opts{'fhm'},
                    mode         => 'show',
                );
                if ($fhm_complete) {
                    $log->comment("$node: All fhm alerts found");
                }
                else {
                    $log->comment("$node: Not all fhm alerts found yet. Retry in $retry_time secs.");
                }
            }
            else {
                $log->comment("fhm to wait is not specified, skipping fhm check");
                $fhm_complete = 1;
            }

            # see if all alerts are found on this node.
            $node_completed++ if ( $shm_complete and $fhm_complete );
        }

        # all nodes have been looped once.
        $timeout   = 1 if ( ( time - $start_time ) > $wait_time );
        $completed = 1
          if ( $node_completed == scalar @{ $self->command_interface() } );

        if ( not $completed and not $timeout ) {
            sleep($retry_time);
        }
    } while ( not $completed and not $timeout );

    # Only fail when the alerts are really missing: a pass that found
    # everything but finished just after the deadline is still a success.
    if ( not $completed ) {
        NATE::BaseException->throw("Alerts did not show up in $wait_time secs.");
    }

    # all alerts found.
    my $time_elapsed = time - $start_time;
    $log->comment("All expected alerts show in $time_elapsed secs.");

    # Log the exit before returning; previously this call sat after the
    # return statement and was never reached.
    $log->exit() if $may_exit;
    return $HealthMonitor::TRUE;
}

=head2 wait_for_clear_alert

Wait for a list of alerts to clear on both clusters. If a list of alerts is
given, this will check to ensure those alerts are set and then wait for them
to be cleared.

If a list of system health alerts is not given, then it is assumed that the
user wanted all of the alerts to be cleared, so that only the alerts which
existed when the object was created remain. It is also assumed that all
switches/bridges that are not specified should have status 'ok'.

    $hm_obj->wait_for_clear_alert();

or

    $hm_obj->wait_for_clear_alert(
        timeout => 3000
    );

or

    $hm_obj->wait_for_clear_alert(
        shm     => \@shm,
        fhm     => $fhm,
        timeout => 3000     # Must be specified in seconds
    );

=over

=item C<< shm => \@shm >>

(Optional) The list of hashes of the system health monitor alerts to be
cleared on the cluster(s). If 'node' is specified, the alert will only be
checked on that specific node.

    [ {'alert' => <alert text>, 'node' => <node name> (Optional)}, ... ]

e.g. To check on a specific node

    {'alert' => 'Physical link on port 1a is offline',
     'node'  => 'dpgqa-mcc-interop-3250-03'};

or to check on all nodes

    {'alert' => 'All ISL links on Cisco_10.228.57.84 are down'};

=item C<< fhm => $fhm >>

(Optional) The hashref to the fabric health monitor alerts and status to be
cleared on the cluster(s). The key should be the wwn of the switch or bridge.

    {
        '<wwn>' => {
            'type'   => <switch|bridge>,
            'alert'  => [alert1, alert2, ...],
            'status' => <expected status>
        }
    }

e.g.

    {
        '2000002a6ab9ce20' => {
            'type'   => 'switch',
            'alert'  => ['Switch is Unreachable over Management Network', ...],
            'status' => 'error'
        }
    }

For the rest of the switches/bridges that are not specified, their status is
assumed to be 'ok'. If a status change is found but not expected, an
exception is thrown immediately.

=item C<< timeout => $timeout >>

(Optional) The amount of time to wait for an alert before failing. Default is
set to 30 minutes.

=item C<< check_initial_shm => $check_initial_shm >>

(Optional) After the specified alerts are cleared, additionally check that the
shm alerts are back to the initial state recorded when the library object was
created. Default is set to true.

=item C<< check_fhm_all_ok => $check_fhm_all_ok >>

(Optional) After the specified alerts are cleared, additionally check that all
switches/bridges show status 'ok'. Default is set to true.

=back

=cut

# Poll every node until the given shm/fhm alerts are gone (and, optionally,
# the initial shm alerts are present and every switch/bridge reports 'ok'),
# or until the timeout elapses.
#
# Parameters (all optional, validated with Params::Validate):
#   shm               - arrayref of system health alerts that must clear
#   fhm               - hashref of fabric health alerts, keyed by wwn
#   timeout           - seconds to wait before giving up (default 1800)
#   check_initial_shm - also run _check_init_shm per node (default 1)
#   check_fhm_all_ok  - also run _check_all_fhm_ok per node (default 1)
#
# Returns $HealthMonitor::TRUE once every node passes all enabled checks.
# Throws NATE::BaseException if that has not happened when polling stops.
# Exceptions thrown by _check_fhm (unexpected switch/bridge status) propagate.
sub wait_for_clear_alert {
    $log->enter() if $may_enter;
    my $self = shift;

    my %opts = validate_with(
        params => \@_,
        spec   => {
            shm               => { type => ARRAYREF, optional => 1 },
            fhm               => { type => HASHREF,  optional => 1 },
            timeout           => { type => SCALAR,   optional => 1, default => 1800 },
            check_initial_shm => { type => SCALAR,   optional => 1, default => 1 },
            check_fhm_all_ok  => { type => SCALAR,   optional => 1, default => 1 },
        },
    );

    my $start_time        = time;
    my $retry_time        = 300;                 # seconds between polling passes
    my $wait_time         = $opts{'timeout'};
    my $check_initial_shm = $opts{'check_initial_shm'};
    my $check_fhm_all_ok  = $opts{'check_fhm_all_ok'};
    my $completed         = 0;
    my $timeout           = 0;

    do {
        my $node_completed = 0;

        # Explicit %{ } dereference: 'keys' on a hashref expression is a
        # fatal error on Perl 5.24 and later.
        foreach my $node ( keys %{ $self->{apisets} } ) {
            my ( $shm_complete, $fhm_complete, $init_shm_exists, $all_fhm_ok );

            if ( exists $opts{'shm'} ) {
                $log->comment("$node: Start to wait for shm alerts clear");
                $shm_complete = $self->_check_shm(
                    node         => $node,
                    expected_shm => \@{ $opts{'shm'} },
                    mode         => 'clear',
                );
                if ($shm_complete) {
                    $log->comment("$node: All shm alerts clear");
                }
                else {
                    $log->comment("$node: shm alerts has not cleared yet. Retry in $retry_time secs.");
                }
            }
            else {
                $log->comment("$node: shm to clear is not specified, skipping shm check");
                $shm_complete = 1;
            }

            if ( exists $opts{'fhm'} ) {
                $log->comment("$node: Start to wait for fhm alerts clear");
                $fhm_complete = $self->_check_fhm(
                    node         => $node,
                    expected_fhm => $opts{'fhm'},
                    mode         => 'clear',
                );
                if ($fhm_complete) {
                    $log->comment("$node: All fhm alerts cleared");
                }
                else {
                    $log->comment("$node: fhm alerts has not cleared yet. Retry in $retry_time secs.");
                }
            }
            else {
                $log->comment("$node: fhm to clear is not specified, skipping fhm check");
                $fhm_complete = 1;
            }

            # The follow-up checks only make sense once the requested
            # alerts have cleared on this node.
            next if not( $shm_complete and $fhm_complete );

            if ($check_initial_shm) {
                $log->comment("$node: Start to check if initial shm alerts exists");
                $init_shm_exists = $self->_check_init_shm( node => $node );
                if ($init_shm_exists) {
                    $log->comment("$node: Initial shm alerts exists after operation.");
                }
                else {
                    $log->comment("$node: Initial shm alerts not exists after operation.");
                }
            }
            else {
                $log->comment("$node: Skip check for initial alerts");
                $init_shm_exists = 1;
            }

            if ($check_fhm_all_ok) {
                $log->comment("$node: Start to check if all fhm shows 'ok'");
                $all_fhm_ok = $self->_check_all_fhm_ok( node => $node );
                if ($all_fhm_ok) {
                    $log->comment("$node: all fhm shows 'ok'");
                }
                else {
                    $log->comment("$node: not all fhm shows 'ok' yet");
                }
            }
            else {
                $log->comment("$node: Skip check for all fhm shows 'ok'");
                $all_fhm_ok = 1;
            }

            if ( $init_shm_exists and $all_fhm_ok ) {
                $node_completed++;
            }
            else {
                $log->comment("$node: retry in $retry_time secs.");
            }
        }

        # all nodes have been looped once.
        $timeout   = 1 if ( ( time - $start_time ) > $wait_time );
        $completed = 1
          if ( $node_completed == scalar @{ $self->command_interface() } );

        if ( not $completed and not $timeout ) {
            sleep($retry_time);
        }
    } while ( not $completed and not $timeout );

    # Only fail when the checks really did not pass: a pass that succeeded
    # but finished just after the deadline is still a success.
    if ( not $completed ) {
        NATE::BaseException->throw("Alerts did not clear properly in $wait_time secs. Failing the test.");
    }

    # all alerts cleared.
    my $time_elapsed = time - $start_time;
    $log->comment("All expected alerts clear in $time_elapsed secs.");

    # Log the exit before returning; previously this call sat after the
    # return statement and was never reached.
    $log->exit() if $may_exit;
    return $HealthMonitor::TRUE;
}

##################################
#         Private Method         #
##################################

# require parameter: apiset (API set object for one node)
# e.g.
#   Usage: $self->_get_shm_alert(apiset => $apiset);
# Fetch the current system health alerts through the given API set.
# return arrayref of current system health alerts,
# or arrayref to empty array if no shm alerts found
sub _get_shm_alert {
    my ( $self, %args ) = @_;

    my $response = $args{apiset}->diagnosis_alert_get_iter();
    my @records  = @{ $response->get_parsed_output() };

    # Guard clause: no 'diagnosis-alert-info' entry means no alerts.
    my $has_alerts =
      exists ${ $records[0]->{'attributes-list'} }[0]->{'diagnosis-alert-info'};
    return [] if !$has_alerts;

    # Hand back a copy of the alert list.
    my @alerts =
      @{ ${ $records[0]->{'attributes-list'} }[0]->{'diagnosis-alert-info'} };
    return \@alerts;
}

# require parameter:
#    apiset (API set object for one node)
# optional, currently ignored:
#    switch_wwn (string) -- every switch is returned regardless
# e.g. $self->_get_fhm_switch(apiset => $apiset, switch_wwn => '2000002a6ab9ce20');
# return: arrayref of 'storage-switch-info' entries
sub _get_fhm_switch {
    my ( $self, %args ) = @_;

    my $response = $args{apiset}->storage_switch_get_iter();
    my @records  = @{ $response->get_parsed_output() };

    return \@{ ${ $records[0]->{'attributes-list'} }[0]->{'storage-switch-info'} };
}

# require parameter:
#    $apiset (NACL::APISet::Node::ZAPI::CMode)
#    $bridge_wwn (string)
# e.g.
#   Usage: $self->_get_fhm_bridge(apiset => $apiset, bridge_wwn => '2000002a6ab9ce20');
#   (bridge_wwn is accepted but not used; every bridge is returned)
# return: arrayref of 'storage-bridge-info' entries
sub _get_fhm_bridge {
    my ( $self, %args ) = @_;
    my $apiset = $args{apiset};

    my $res        = $apiset->storage_bridge_get_iter();
    my @parsed_res = @{ $res->get_parsed_output() };

    return \@{ ${ $parsed_res[0]->{'attributes-list'} }[0]->{'storage-bridge-info'} };
}

# create APISet connection object from node
# require: 'nodes' => arrayref of nodes (NACL::C::Node)
# return: hashref of APISet (NACL::APISet::Node::ZAPI::CMode),
#         keyed by each node's 'node' field
sub _get_nodes_api {
    my ( $self, %args ) = @_;
    my @nodes = @{ $args{'nodes'} };

    my %api_hash;
    foreach my $node (@nodes) {
        my $apiset = $node->apiset(
            category  => "Node",
            interface => "ZAPI",
            set       => "CMode"
        );
        $api_hash{ $node->{'node'} } = $apiset;
    }

    return \%api_hash;
}

# check if shm alerts are cleared or found
# passed:
#    mode         => 'show' or 'clear'
#    node         => node name (key into $self->{apisets})
#    expected_shm => arrayref of { alert => <pattern>, node => <name> (optional) }
#
# Each 'alert' is used as a case-insensitive regular expression against the
# 'probable-cause-description' of every current alert. An expectation with a
# 'node' key only applies when that key equals the node being checked, and
# then only matches alerts whose own 'node' field is the same.
#
# return:
#    'show'  mode: 1 if expected_shm is non-empty and every expectation that
#                  applies to this node matched at least one current alert.
#    'clear' mode: 1 if no applicable expectation matched any current alert.
#    0 otherwise (including an unrecognised mode).
sub _check_shm {
    my ( $self, %args ) = @_;
    my $node         = $args{'node'};
    my @expected_shm = @{ $args{'expected_shm'} };
    my @current_shm =
      @{ $self->_get_shm_alert( apiset => $self->{apisets}->{$node} ) };

    # Expectations that apply to this node: node-agnostic ones plus those
    # addressed to this node.
    my @applicable =
      grep { !exists $_->{'node'} or $_->{'node'} eq $node } @expected_shm;

    # Track WHICH expectations matched instead of counting match events.
    # Counting events is wrong in both directions: one pattern matching two
    # alerts can hide a pattern that matched nothing, and a duplicated alert
    # pushes the count past the expected total so 'show' never succeeds.
    my %matched;

    foreach my $shm_alert (@current_shm) {
        $self->_print_shm($shm_alert);

        foreach my $idx ( 0 .. $#applicable ) {
            my $expected_alert = $applicable[$idx];
            my $node_specific  = exists $expected_alert->{'node'};

            next
              if $shm_alert->{'probable-cause-description'} !~
              /$expected_alert->{'alert'}/i;
            next
              if ( $node_specific
                and $shm_alert->{'node'} ne $expected_alert->{'node'} );

            $matched{$idx} = 1;

            if ($node_specific) {
                if ( $args{mode} eq 'show' ) {
                    $log->comment("$node: Found node-specific alert '$shm_alert->{'probable-cause-description'}' ".
                                  "matches given '$expected_alert->{'alert'}'");
                }
                elsif ( $args{mode} eq 'clear' ) {
                    $log->comment("$node: Node-specific alert '$shm_alert->{'probable-cause-description'}' ".
                                  "matches given '$expected_alert->{'alert'}' ".
                                  "has not been cleared");
                }
            }
            else {
                # this alert should show on all nodes.
                if ( $args{mode} eq 'show' ) {
                    $log->comment("$node: Found alert '$shm_alert->{'probable-cause-description'}' ".
                                  "matches given '$expected_alert->{'alert'}' ");
                }
                elsif ( $args{mode} eq 'clear' ) {
                    $log->comment("$node: Alert '$shm_alert->{'probable-cause-description'}' ".
                                  "matches given'$expected_alert->{'alert'}' ".
                                  "has not been cleared yet");
                }
            }
        }
    }

    my $matched_count = scalar keys %matched;

    if ( $args{mode} eq 'show' ) {
        return 1
          if ( scalar @expected_shm != 0
            and $matched_count == scalar @applicable );
    }
    elsif ( $args{mode} eq 'clear' ) {
        return 1 if ( $matched_count == 0 );
    }

    return 0;
}

# check if fhm alerts are cleared or found
# passed:
#    $mode => show/clear
#    $node
#    $expected_fhm (hashref)
#      # specify if there should be any alert or status is not 'ok'
#      # for none specified, we'll assume they should be ok and no error
#
# return:
#    1 if all found in this node.
#    0 if not all found in this node.
sub _check_fhm {
    # Compare the switches and bridges reported for one node against the
    # expected fabric health state.
    #
    # Arguments (named):
    #   node         - node name (key into $self->{apisets})
    #   expected_fhm - hashref keyed by wwn; each value has 'type'
    #                  ('switch' or 'bridge'), 'status' and 'alert' (arrayref
    #                  of patterns, matched case-insensitively as regexes
    #                  against each reported 'error-message')
    #   mode         - 'show' or 'clear'
    #
    # A listed device counts as complete only when its reported status equals
    # the expected status and, in 'show' mode, the number of message matches
    # equals the number of expected alerts, or, in 'clear' mode, no expected
    # alert matches any reported message.
    #
    # Returns 1 when the number of complete devices equals the number of
    # entries in expected_fhm, 0 otherwise.
    # Throws NATE::BaseException when a device NOT listed (or listed under
    # the other type) reports a status other than 'ok'.
    my ( $self, %args ) = @_;
    my $node = $args{'node'};
    $log->comment("$node: Start to check fhm");

    my $expected_fhm = $args{'expected_fhm'};
    my @switches =
      @{ $self->_get_fhm_switch( apiset => $self->{apisets}->{$node} ) };
    my @bridges =
      @{ $self->_get_fhm_bridge( apiset => $self->{apisets}->{$node} ) };

    my $switch_complete = 0;
    my $bridge_complete = 0;

    # Check switches
    foreach my $switch (@switches) {
        $log->comment("$switch->{'name'} status: $switch->{'status'}");
        my $found_fhm_msg = 0;
        my $wwn           = $switch->{'wwn'};

        if ( exists $expected_fhm->{$wwn}
            and $expected_fhm->{$wwn}->{type} eq 'switch' )
        {
            my $expected = $expected_fhm->{$wwn};

            if ( $switch->{'status'} eq $expected->{'status'} ) {
                foreach my $switch_error (
                    @{ ${ $switch->{'storage-switch-error-list'} }[0]
                          ->{'storage-switch-error-info'} } )
                {
                    $log->comment("$switch->{'name'}".
                                  " show fhm error message: ".
                                  "$switch_error->{'error-message'}");

                    # find the shown alert from given array
                    foreach my $expected_fhm_error ( @{ $expected->{'alert'} } ) {
                        if ( $switch_error->{'error-message'} =~ /$expected_fhm_error/i ) {
                            $found_fhm_msg++;
                            $log->comment("Expected fhm error found on switch: [$switch_error->{'type'}] ".
                                          "'$switch_error->{'error-message'}'".
                                          "on $switch->{'name'}");
                        }
                    }
                }

                # looped through all errors on this switch
                if ( $args{mode} eq 'show'
                    and $found_fhm_msg == scalar( @{ $expected->{'alert'} } ) )
                {
                    $switch_complete++;
                }
                elsif ( $args{mode} eq 'clear' and $found_fhm_msg == 0 ) {
                    $switch_complete++;
                }
            }
        }
        else {
            # if not specified, the status should be "ok"
            if ( $switch->{'status'} ne 'ok' ) {
                NATE::BaseException->throw("switch $switch->{'name'} ".
                    "status should be 'ok', but shows '$switch->{'status'}'. Failing the test");
            }
        }
    }    # Done with switch check

    # Check bridges
    foreach my $bridge (@bridges) {
        $log->comment("$bridge->{'name'} status: $bridge->{'status'}");
        my $found_fhm_msg = 0;
        my $wwn           = $bridge->{'wwn'};

        if ( exists $expected_fhm->{$wwn}
            and $expected_fhm->{$wwn}->{type} eq 'bridge' )
        {
            my $expected = $expected_fhm->{$wwn};

            if ( $bridge->{'status'} eq $expected->{'status'} ) {
                foreach my $bridge_error (
                    @{ ${ $bridge->{'storage-bridge-error-list'} }[0]
                          ->{'storage-bridge-error-info'} } )
                {
                    $log->comment("$bridge->{'name'}".
                                  " show fhm error message: ".
                                  "$bridge_error->{'error-message'}");

                    # find the shown alert from given array
                    foreach my $expected_fhm_error ( @{ $expected->{'alert'} } ) {
                        if ( $bridge_error->{'error-message'} =~ /$expected_fhm_error/i ) {
                            $found_fhm_msg++;
                            $log->comment("Expected bridge fhm error found: [$bridge_error->{'error-type'}] ".
                                          "'$bridge_error->{'error-message'}'".
                                          "on $bridge->{'name'}");
                        }
                    }
                }

                # looped through all errors on this bridge
                if ( $args{mode} eq 'show'
                    and $found_fhm_msg == scalar( @{ $expected->{'alert'} } ) )
                {
                    $bridge_complete++;
                }
                elsif ( $args{mode} eq 'clear' and $found_fhm_msg == 0 ) {
                    $bridge_complete++;
                }
            }
        }
        else {
            # if not specified, the status should be "ok"
            if ( $bridge->{'status'} ne 'ok' ) {
                NATE::BaseException->throw("bridge $bridge->{'name'} ".
                    "status should be 'ok', but shows '$bridge->{'status'}'. Failing the test");
            }
        }
    }    # Done with bridge check

    # check if all fhm found.
    # Explicit %{ } dereference: 'keys' on a hashref variable is a fatal
    # error on Perl 5.24 and later.
    if ( ( $switch_complete + $bridge_complete ) == scalar( keys %{$expected_fhm} ) ) {
        return 1;
    }

    return 0;
}

# to check if all initial shm exists.
# cannot reuse _check_shm due to duplicated entries on each node.
sub _check_init_shm {
    # Check whether the alerts recorded for this node at construction time
    # (in $self->{init_shm_alert}) are present in the current alert list.
    #
    # Arguments (named):
    #   node - node name (key into $self->{apisets})
    #
    # Returns:
    #   1 if initial alerts were recorded for the node and each of them is
    #     found (case-insensitive, literal substring) in some current alert's
    #     'probable-cause-description';
    #   1 if no initial alerts were recorded and there are none now;
    #   0 otherwise.
    my ( $self, %args ) = @_;
    my $node = $args{'node'};
    my @current_shm =
      @{ $self->_get_shm_alert( apiset => $self->{apisets}->{$node} ) };

    if ( exists $self->{init_shm_alert}->{$node} ) {
        my @init_alerts = @{ $self->{init_shm_alert}->{$node} };

        # Record which initial alerts were seen rather than counting match
        # events, so repeated descriptions do not multiply the count.
        my %seen_init;

        foreach my $shm_alert (@current_shm) {
            $self->_print_shm($shm_alert);
            foreach my $idx ( 0 .. $#init_alerts ) {
                my $init_description =
                  $init_alerts[$idx]->{'probable-cause-description'};

                # \Q..\E: the stored description is literal text, not a
                # pattern; unescaped metacharacters would break the match.
                if ( $shm_alert->{'probable-cause-description'} =~
                    /\Q$init_description\E/i )
                {
                    $seen_init{$idx} = 1;
                }
            }
        }

        # check all initial shm alerts are shown
        if ( scalar @init_alerts != 0
            and scalar( keys %seen_init ) == scalar @init_alerts )
        {
            return 1;
        }
        return 0;
    }

    # No initial alerts on this node: any alert still present means the
    # system has not returned to its initial state.
    if ( scalar @current_shm != 0 ) {
        $log->comment("$node: There should be no initial alerts. shm alerts has not been cleared yet");
        foreach my $shm_alert (@current_shm) {
            $self->_print_shm($shm_alert);
        }
        return 0;
    }

    return 1;
}

# check if all fhm alert shows 'ok'
# passed:
#    node => node name (key into $self->{apisets})
#
# return:
#    1 if all 'ok' in this node.
#    0 if not all 'ok' in this node (stops at the first one that is not).
sub _check_all_fhm_ok {
    my ( $self, %args ) = @_;
    my $node = $args{'node'};
    $log->comment("$node: Start to check all fhm ok");

    my @switches =
      @{ $self->_get_fhm_switch( apiset => $self->{apisets}->{$node} ) };
    my @bridges =
      @{ $self->_get_fhm_bridge( apiset => $self->{apisets}->{$node} ) };

    foreach my $switch (@switches) {
        $log->comment("$switch->{'name'} status: $switch->{'status'}");
        if ( $switch->{'status'} ne 'ok' ) {
            return 0;
        }
    }    # Done with switch check

    # Check bridges
    foreach my $bridge (@bridges) {
        $log->comment("$bridge->{'name'} status: $bridge->{'status'}");
        if ( $bridge->{'status'} ne 'ok' ) {
            return 0;
        }
    }    # Done with bridge check

    return 1;
}

# Log one system health alert: its node, alert id, probable cause description
# and alerting resource name.
sub _print_shm {
    my $self      = shift;
    my $shm_alert = shift;

    $log->comment("$shm_alert->{'node'} shows alert [$shm_alert->{'alert-id'}] " .
                  "'$shm_alert->{'probable-cause-description'}' " .
                  "on $shm_alert->{'alerting-resource-name'} ");
}

1;