## Copyright (c) 2001-2017 NetApp, Inc., All Rights Reserved ## Any use, modification, or distribution is prohibited ## without prior written consent from NetApp, Inc. ## # ## @summary HA Task Module ## @author rajaram@netapp.com,dl-ha-qa@netapp.com ## @status shared ## @pod here =head1 NAME NACL::MTask::HA =head1 DESCRIPTION This task is to be used for initiating storage failover takeover and storage failover giveback, and verify that the relevant ems, asup are generated, the nodes are in appropriate states, aggr are online and disk ownerships are as expected. Apart from validating the states of the nodes after Takeover/Giveback (done by NACL::STask::StorageFailover), the task performs the following verifications additionally :: checking for appropriate ems, asup, takeover/giveback duration, disk ownership changes, aggregate states, etc. If users want to do only a basic check of the state of the nodes after a takeover/giveback then they can use the StorageFailover STask (NACL::STask::StorageFailover). This MTask will in turn be calling the StorageFailover STask methods wherever necessary # Invoking a storage failover takeover. use NACL::MTask::HA; ... my $Node1 = NACL::MTask::HA->new( command_interface => $node, ); $Node1->takeover(type=>"reboot", nacltask_verify_aggr_state_owner => 1); ... # Perform any operations here ... $Node1->giveback('override-vetoes'=> "true", 'require-partner-waiting'=> "true", nacltask_verify_ems => 1); # End test =cut =head1 ATTRIBUTES If users want to create an HA object when partner is down but the actual cross connect is present check the below example use NACL::MTask::HA; ... my $Node1 = NACL::MTask::HA->new( command_interface => $node, verify_cross_connect => "0", reset_flags => "0" ); ... # Perform any operations here ... 
$Node1->giveback(); # End test =head2 node The node on which the takeover/giveback will be initiated =cut package NACL::MTask::HA; use strict; use warnings; use base qw(NACL::MTask::MTask); use NATE::Log qw(log_global); my $Log = log_global(); my $may_enter = $Log->may_enter(); my $may_exit = $Log->may_exit(); use NATE::Process; use Params::Validate qw(validate validate_with BOOLEAN HASHREF OBJECT ARRAYREF SCALAR); use NACL::Exceptions::OperationFailed; use NACL::Exceptions::EventCheckFailure qw(:try); use NACL::Exceptions::UnexpectedState qw(:try); use NACL::APISet::Exceptions::ResponseException qw(:try); use NATE::Exceptions::Argument; use NACL::STask::Node; use NACL::C::StorageAggregate; use NACL::C::Volume; use NACL::STask::StorageDisk; use NACL::STask::StorageFailover; use NACL::MTask::EventLogDetector; use NACL::C::EventLog; use NACL::CS::StorageFailover; use NACL::MTask::StorageAggregateRelocation; use NACL::C::SystemServicesNtpConfig; use NACL::C::EventConfig; use NACL::MTask::SystemLogDetector; use NACL::ComponentUtils qw(Dumper); use NACL::Exceptions::EventCheckFailure qw(:try); use NACL::C::Exceptions::TimeNotSynchronised qw(:try); use NACL::APISet::Exceptions::TimeoutException qw(:try); use NACL::GeneralUtils qw(nacl_method_retry); use NACL::C::Exceptions::StorageAggregate::ZsmConnectionError qw(:try); use NATE::STAF::Peer qw($Parent); use NATE::Events qw(listener_add listener_remove eventloop); use NACL::CS::ClusterDate; use NACL::C::ClusterDate; use NACL::MTask::Exceptions::HAGatherDiagException; use Storable qw(freeze thaw); # Max duration within which a takeover or Giveback is expected to complete use constant MAX_TO_GB_DURATION => 60; use constant POLL_INTERVAL => 10; # Timeout for aggr to come online use constant AGGR_ONLINE_TIMEOUT => 40; # SFO aggregate outage timeout # burt826198 changed from 60 -> 120 use constant SFO_AGGR_ONLINE_TIMEOUT => 120; # CFO aggregate outage timeout use constant CFO_AGGR_ONLINE_TIMEOUT => 60; # After giveback node 
# takes about 400 seconds to reach SF_UP
use constant WFG_TO_SF_UP_TIMEOUT => 320;

# Timeout for failover state to become 'In takeover'
# after takeover operation. For FANTA, timeout is
# (No. of online aggrs) * SFO_AGGR_ONLINE_TIMEOUT.
# Otherwise it is TAKEOVER_TIMEOUT.
use constant TAKEOVER_TIMEOUT          => 120;
use constant POLL_DELTA                => 5;
use constant MCC_IP_DETECTION_TIME     => 20;
use constant NON_MCC_IP_DETECTION_TIME => 15;

=head1 METHODS

=head2 new

    my $Node1 = NACL::MTask::HA->new(
        node => $node,
    );

Create a HA object that can be used to initiate takeover or giveback and
do the relevant verifications.

=over

=item Options

=over

=item C<< node => $node >>

(Required, isa NACL::C::Node)

The node on which to initiate takeover/giveback. The same object is used
as the command interface for every command issued by this task.

=item C<< reset_flags => $boolean >>

(Optional, Default value is 1)

Setting this option will reset the HA flags to their default values upon
object creation

=item C<< verify_cross_connect => $boolean >>

(Optional, Default value is 1)

Setting this option will check that the disk shelves are cross-connected
and will also set the storage failover mode to ha

=back

=back

=cut

use Class::MethodMaker [
    new    => [ '-hash', '-init', 'new' ],
    scalar => 'node',
    scalar => 'reset_flags',
    array  => '_partner_online_sfo_aggrs',
    scalar => '_partner_cfo_aggr',
    scalar => '_partner_sfo_aggr_volumes',
    scalar => 'verify_cross_connect',
    scalar => [
        { -type => 'NACL::MTask::StorageAggregateRelocation' },
        'arl_mtask_obj',
    ],
];

# Initialisation hook named by the '-init' option of the generated
# constructor. It only forwards the constructor arguments to
# init_executed() through _execute_with_gather_diag().
# NOTE(review): _execute_with_gather_diag() is defined outside this view;
# presumably it runs the named method and raises the given exception class
# after collecting diagnostics on failure - confirm.
sub init {
    my ( $self, %args ) = @_;
    $self->_execute_with_gather_diag(
        method      => 'init_executed',
        exception   => 'NACL::MTask::Exceptions::HAGatherDiagException',
        method_args => \%args,
    );
}

# Performs the real object initialisation:
#   1. optionally verifies the shelf cross-connect and enables storage
#      failover (verify_cross_connect),
#   2. optionally resets the HA flags on the node and its partner
#      (reset_flags),
#   3. completes a pending giveback when the partner state matches
#      'partial_giveback',
#   4. records the partner's online sfo aggregates, its online cfo
#      aggregate and the volumes of the online sfo aggregates on $self,
#   5. turns EMS suppression off.
#
# Options (validated with Params::Validate):
#   node                 - NACL::C::Node object (required)
#   reset_flags          - boolean, default 1
#   verify_cross_connect - boolean, default 1
#
# Throws NACL::APISet::Exceptions::ResponseException when the cross-connect
# check fails; exceptions raised by the other called components are not
# handled here.
sub init_executed {
    $Log->enter() if $may_enter;
    my $self = shift;
    my %opts = validate_with(
        params => \@_,
        spec   => {
            node                 => { type => OBJECT, isa => "NACL::C::Node" },
            reset_flags          => { type => BOOLEAN, default => 1 },
            verify_cross_connect => { type => BOOLEAN, default => 1 },
        }
    );

    my %common_opts;
    $self->_move_common_component_params(
        source => \%opts,
        target => \%common_opts
    );

    my $node = $opts{node};
    my $partner =
      NACL::STask::Node->get_partner_obj( command_interface => $node );
    my $node_name    = $node->node();
    my $partner_name = $partner->node();

    ## If verify_cross_connect is set then verify the disk shelves are
    ## cross-connected and also setting the storage failover mode to ha.
    if ( $opts{verify_cross_connect} ) {
        $Log->comment(
            "Check if disk shelves are cross-connected to the filers");

        ## Check if disk shelves are cross-connected to the filers
        try {
            NACL::STask::StorageDisk->check_cross_connect(
                command_interface => $node,
                node1             => $node_name,
                node2             => $partner_name,
                %common_opts,
            );
        }
        catch NACL::APISet::Exceptions::ResponseException with {
            my $exception_object = shift;
            my $reason           = $exception_object->text();
            $Log->comment( "Failed to check cross connect, " . $reason );

            # Keep the underlying reason in the rethrown exception so the
            # caller does not have to dig it out of the log.
            NACL::APISet::Exceptions::ResponseException->throw(
                "Failed to check cross connect: " . $reason );
        };

        # When the flags are not going to be reset, remember whether
        # auto-giveback is currently disabled so that it can be disabled
        # again after storage failover has been enabled below.
        my $check_auto_giveback = 0;
        unless ( $opts{reset_flags} ) {
            my $cs = NACL::CS::StorageFailover->fetch(
                command_interface => $node,
                filter            => { node => $node_name },
                requested_fields  => [qw(auto-giveback)],
            );
            if ( $cs->auto_giveback() =~ /false/i ) {
                $check_auto_giveback = 1;
            }
        }

        ## HANLR changes need to go in here
        ## Setting storage failover mode to ha
        $Log->comment("Enable Storage failover");

        ## Enable storage failover
        NACL::STask::StorageFailover->enable(
            command_interface => $node,
            node              => $node,
            %common_opts,
        );

        if ($check_auto_giveback) {
            NACL::C::StorageFailover->modify(
                command_interface => $node,
                node              => $node,
                'auto-giveback'   => "false",
            );
        }
    }

    ## If reset_flags is set then reset both the node and the partner nodes'
    ## HA flags
    if ( $opts{reset_flags} ) {
        $Log->comment("Reset HA flags");
        $self->reset_HA_flags( node => "$node_name,$partner_name", );
    }

    # Need to check 'partial_giveback' failover state.
    # If so, issue giveback command.
    my $cs = NACL::CS::StorageFailover->fetch(
        command_interface => $node,
        filter            => { node => $node_name },
        requested_fields  => [qw(partner-state)],
    );
    if ( $cs->partner_state() =~ m[partial_giveback]i ) {
        NACL::STask::StorageFailover->giveback(
            command_interface => $node,
            node              => $node,
            partner           => $partner,
            'override-vetoes' => 'true',
        );
    }

    # Get the list of partner aggr names
    my @partner_aggrs = NACL::CS::StorageAggregate->fetch(
        command_interface => $node,
        filter            => { 'home-name' => $partner_name },
        requested_fields  => [qw (aggregate ha-policy state)],
    );

    # Only online aggregates are remembered: every online sfo aggregate,
    # plus the online cfo aggregate (the last one seen if several exist).
    my @online_sfo_aggregates;
    foreach my $aggr (@partner_aggrs) {
        my $policy = $aggr->ha_policy();
        next if $aggr->state ne 'online';

        if ( $policy eq 'sfo' ) {
            push( @online_sfo_aggregates, $aggr->aggregate() );
        }
        elsif ( $policy eq 'cfo' ) {
            $self->_partner_cfo_aggr( $aggr->aggregate() );
        }
    }
    $self->_partner_online_sfo_aggrs(@online_sfo_aggregates);

    $Log->debug(
        sub {
            "capturing Volume state information for all volumes in the partner aggregate list";
        }
    );

    # Builds { aggregate => { volume => { vserver => ..., state => ... } } }
    my %volumes_aggr;
    foreach my $aggr (@online_sfo_aggregates) {
        my @aggr_vol = NACL::CS::Volume->fetch(
            command_interface => $node,
            filter            => { aggregate => $aggr },
            requested_fields  => [qw(volume vserver state)],
            is_system_vol     => 0,
            allow_empty       => 1
        );
        foreach my $av (@aggr_vol) {
            my $volume = $av->volume();
            $volumes_aggr{$aggr}{$volume}{'vserver'} = $av->vserver();
            $volumes_aggr{$aggr}{$volume}{'state'}   = $av->state();
        }
    }
    $self->_partner_sfo_aggr_volumes( \%volumes_aggr );

    NACL::C::EventConfig->modify(
        command_interface => $node,
        "suppression"     => "off"
    );

    $Log->exit() if $may_exit;
}    ## end sub init

=head2 takeover

    $Node1->takeover();

(Instance method)

Initiate a takeover from Node1 and verify the takeover.
FANTA takeover is applicable on following cases :
1. If takeover type is takeover_command, 'bypass-optimization' is false
and 'option' is normal or allow-version-mismatch.
2.
If 'bypass-optimization' attribute is not defined, it checks 'bypass-takeover-optimization' field and based on its value, sets the 'bypass-optimization' attribute. NATE::Exceptions::Argument will be thrown if 'bypass-optimization' attribute is specified for RR and BR. =over =item Options =over =item C<< 'skip-lif-migration' => true|false >> (Optional, defaults to UNDEF) If this option is defined then the value is set before issuing takeover. This is an option in "storage failover takeover" command. =item C<< type => $scalar >> ( Optional, defaults to "takeover_command" ) Possible values are "takeover_command", "panic", "reboot", "halt", "powercycle", "watchdog_reset". Currently the following options of takeover are supported - takeover_command, panic, reboot, halt. The remaining options of takeover powercycle/watchdog_reset will be implemented later. =item C<< option => $scalar >> ( Optional, defaults to "normal" ) Possible values are "normal", "immediate", "force", "allow-version-mismatch", "allow-disk-inventory-mismatch". =item C<< polling_interval => $scalar >> ( Optional, defaults to 10secs ) This is the interval (in seconds) after which the status of takeover will be polled. =item C<< method_timeout => $method_timeout >> (Optional) Time in seconds when takeover is expected to complete Default : Value used in the STask StorageFailover =item C<< takeover_expected => $boolean >> ( Optional, defaults to 1 ) Boolean value, 0 or 1. Option to indicate if takeover is expected to happen or not. The behaviour of "nacltask_verify" depends on whether a takeover is expected or not. =item C<< nacltask_inhibit_takeover => $boolean >> ( Optional, defaults to 0 ) Option to indicate if the takeover due to reboot/halt should be inhibited =item C<< failure_reason => $scalar >> ( Optional ) The reason for which the takeover is expected to fail. 
Possible values are, disk_inventory_mismatch version_mismatch nvram_logs_unsynced unable_to_access_partner_mailbox_disks This option is not yet implemented. =item C<< nacltask_verify_node_sfo_state => $boolean >> (Optional, defaults to 1) If 1, then verify the state of the local node using wait_for_state() STask =item C<< nacltask_verify_partner_sfo_state => $boolean >> (Optional, defaults to 0) If 1, then verify the state of the partner node using wait_for_state() STask =item C<< nacltask_verify_aggr_state_owner => $boolean >> (Optional, defaults to 1) If 1, then verify that all the aggregates are online after takeover and also verify the ownerships of sfo/cfo aggrs. If the method is not able to query the aggregate and check its state after repeated polling for 60 seconds, due to the following error- "Error: show failed: Failed to get the information for aggregate {aggr}. Reason: ZSM - Can't connect to host", then an exception of type C<NACL::C::Exceptions::StorageAggregate::ZsmConnectionError> is thrown. =item C<< nacltask_verify_ems => $boolean >> (Optional, defaults to 1) If 1, then verify that the following ems messages are generated and takeover duration is within limits. =item C<< nacltask_verify_asup => $boolean >> (Optional, defaults to 0) If 1, then verify that the appropriate asup's are generated. Currently not supported. =item C<< nacltask_verify_disk_ownership => $boolean >> (Optional, defaults to 0) If 1, then verify that the current owner for disks in sfo-styled aggrs is the node. Currently not supported. =item C<< nacltask_powercycle_after_panic => $boolean >> (Optional, defaults to 0) If 1, then partner node will be powercycled when takeover_type ='panic' in STask. When the value is 0, the routine ensures the partner reaches the firmware prompt after panic in STask. 
=back

=back

=cut

# Public entry point for a takeover. Forwards the caller's options to
# takeover_executed() through _execute_with_gather_diag().
# NOTE(review): _execute_with_gather_diag() is defined outside this view;
# presumably it raises the given exception class after gathering
# diagnostics when the wrapped method fails - confirm.
sub takeover {
    my ( $self, %args ) = @_;
    $self->_execute_with_gather_diag(
        method      => 'takeover_executed',
        exception   => 'NACL::MTask::Exceptions::HAGatherDiagException',
        method_args => \%args,
    );
}

# Initiates a takeover from $self->node() and runs the requested
# verifications (see the POD of takeover() for the option list).
#
# Two paths exist:
#   * non-FANTA: 'bypass-optimization' is 'true', or the type is not
#     'takeover_command', or the option is not 'normal', or the takeover is
#     not expected. The STask takeover is called and, if requested, the
#     aggregate state/owner is verified.
#   * FANTA: the STask takeover is called with its own state checks turned
#     off, the aggregate relocation is verified through
#     NACL::MTask::StorageAggregateRelocation and the failover state is
#     then awaited explicitly.
#
# Verification failures (NACL::Exceptions::UnexpectedState from the
# aggregate check, NACL::Exceptions::EventCheckFailure from the EMS check)
# are collected and reported together in one
# NACL::Exceptions::OperationFailed at the end. Other exceptions propagate
# unchanged.
sub takeover_executed {
    $Log->enter() if $may_enter;
    my $self = shift;
    my %opts = validate_with(
        params => \@_,
        spec   => {
            type   => { type => SCALAR, default => "takeover_command" },
            option => { type => SCALAR, default => "normal" },
            'skip-lif-migration'  => { type => SCALAR, optional => 1 },
            'bypass-optimization' => { type => SCALAR, optional => 1 },
            polling_interval => { type => SCALAR, default => POLL_INTERVAL },
            method_timeout    => { type => SCALAR, optional => 1 },
            takeover_expected => { type => SCALAR, default  => 1 },
            failure_reason    => { type => SCALAR, optional => 1 },
            nacltask_verify_node_sfo_state =>
              { type => BOOLEAN, default => 1 },
            nacltask_verify_partner_sfo_state =>
              { type => BOOLEAN, default => 0 },
            nacltask_verify_aggr_state_owner =>
              { type => BOOLEAN, default => 1 },
            nacltask_verify_ems       => { type => BOOLEAN, default => 1 },
            nacltask_inhibit_takeover => { type => BOOLEAN, default => 0 },
            nacltask_powercycle_after_panic =>
              { type => BOOLEAN, default => 0 },
            nacltask_boot_after_halt => { type => BOOLEAN, default => 0 },
            nacltask_log_retries     => { type => SCALAR,  default => 3 },
        },
        allow_extra => 1,
    );

    my %common_opts;
    $self->_move_common_component_params(
        source => \%opts,
        target => \%common_opts
    );

    my $node      = $self->node();
    my $node_name = $node->node();
    my $partner =
      NACL::STask::Node->get_partner_obj( command_interface => $node );
    my $partner_name = $partner->node();

    ## Temporary fix to bypass the ems verification ( Fix will be back ported after the burt749767 is fixed )
    my $ems_check = delete $opts{nacltask_verify_ems};
    $ems_check = !$::BYPASS_EMS if ( defined $::BYPASS_EMS && $ems_check );
    my $aggr_check           = delete $opts{nacltask_verify_aggr_state_owner};
    my $nacltask_log_retries = delete $opts{nacltask_log_retries};

    my @online_sfo_aggregates = $self->_partner_online_sfo_aggrs();

    # _partner_cfo_aggr() is only set when an online cfo aggregate was seen
    # at construction time; drop the undef so no nameless aggregate is
    # handed to the verification.
    my @partner_aggr_names =
      grep { defined } ( @online_sfo_aggregates, $self->_partner_cfo_aggr() );

    my $is_fanta_takeover;
    my $takeover_type = $opts{type};

    # store any errors we have encountered
    my @error;

    # Constructing the hash to send to STask takeover
    my %sfo_opts = (
        node                         => $node,
        partner                      => $partner,
        takeovertype                 => $opts{type},
        option                       => $opts{option},
        nacltask_poll_interval       => $opts{polling_interval},
        nacltask_inhibit_takeover    => $opts{nacltask_inhibit_takeover},
        nacltask_verify              => $opts{nacltask_verify_node_sfo_state},
        nacltask_check_partner_state =>
          $opts{nacltask_verify_partner_sfo_state},
        nacltask_powercycle_after_panic =>
          $opts{nacltask_powercycle_after_panic},
        nacltask_expect_takeover => $opts{takeover_expected},
        nacltask_boot_after_halt => $opts{nacltask_boot_after_halt},
    );
    if ( defined $opts{method_timeout} ) {
        $sfo_opts{'method-timeout'} = $opts{method_timeout};
    }
    foreach my $passthrough ( 'skip-lif-migration', 'bypass-optimization' ) {
        if ( defined $opts{$passthrough} ) {
            $sfo_opts{$passthrough} = $opts{$passthrough};
        }
    }

    # Set the verification details
    if ( $opts{nacltask_inhibit_takeover} or !( $opts{takeover_expected} ) )
    {
        $ems_check  = 0;
        $aggr_check = 0;
    }

    # Takeover types for which the two hardware-assist events below are
    # expected in addition to the common takeover events.
    my %is_hwassist_type = map { ( $_ => 1 ) }
      qw(system_powercycle system_power_off system_reset watchdog_reset);
    my @hwassist_events =
      ( 'cf.fsm.stateTransit', 'cf.hwassist.takeoverTrapRecv' );
    my %is_hwassist_event = map { ( $_ => 1 ) } @hwassist_events;

    # Construct and start the EventLogDetector object
    my @check_for_all_presence = (
        'cf.fm.takeoverStarted', 'cf.fm.takeoverComplete',
        'cf.fm.takeoverDuration',
    );
    if ( $is_hwassist_type{$takeover_type} ) {
        push( @check_for_all_presence, @hwassist_events );
    }

    my $system_log_detector = NACL::MTask::SystemLogDetector->new(
        command_interface           => $node,
        set_log_file                => ['/mroot/etc/log/ems'],
        nacltask_skip_nonascii_file => 0,
    );
    $system_log_detector->start();

    # If 'bypass-optimization' option isn't defined and takeover
    # optimization supported, check bypass-takeover-optimization
    # attribute's value and assign the same value to
    # $sfo_opts{'bypass-optimization'} parameter.
    ### bypass-takeover-optimization option is not supported in HN.0. BURT 1023506.
    #if ( !defined $sfo_opts{'bypass-optimization'} ) {
    #    $sfo_opts{'bypass-optimization'} = NACL::CS::StorageFailover->fetch(
    #        command_interface => $node,
    #        filter => { node => $node },
    #        requested_fields => [qw(bypass-takeover-optimization)]
    #    )->bypass_takeover_optimization();
    #}

    # Check these conditions 1> takeover optimization isn't supported
    # 2> bypass-optimization is specifed as 'true'. If any one of
    # these condition is true, then it is non FANTA takeover, otherwise
    # fanta takeover
    #
    # Default to 'false' only when the caller did not choose a value; an
    # unconditional assignment would discard an explicit 'true' and make
    # the 'true' test below unreachable.
    if ( !defined $sfo_opts{'bypass-optimization'} ) {
        $sfo_opts{'bypass-optimization'} = 'false';
    }

    my $detector =
      NACL::MTask::EventLogDetector->new( command_interface => $node, );
    my $proc;

    # Marks the start time for the event log fetch and starts the
    # background process that collects the event log. Does nothing when
    # the EMS check is turned off.
    my $start_ems_capture = sub {
        return unless $ems_check;
        $detector->start();
        my %args = ( 'detector' => $detector );
        $proc = NATE::Process->new(
            codespec => sub { $self->_event_collector(@_); },
            args     => [%args],
            runid    => 'TO_event_logs',
        );
        $proc->start;
        return;
    };

    if (   $sfo_opts{'bypass-optimization'} eq 'true'
        || $sfo_opts{takeovertype} ne 'takeover_command'
        || $sfo_opts{option} ne 'normal'
        || !$opts{takeover_expected} )
    {

        # Start capturing ems logs
        $start_ems_capture->();

        # Calling the STask takeover to initiate a takeover
        NACL::STask::StorageFailover->takeover(
            command_interface => $node,
            %sfo_opts,
            %common_opts,
        );

        # Call verify methods to validate the different
        # elements after takeover
        # Aggr check
        if ($aggr_check) {
            try {
                $self->_verify_aggr_state_owner(
                    aggr_names => \@partner_aggr_names,
                    home_name  => $partner_name,
                    owner_name => $node_name,
                );
            }
            catch NACL::Exceptions::UnexpectedState with {
                my $exception_object = shift;
                push( @error, $exception_object->text() );
            };
        }
    }
    else {

        # FANTA Takeover
        my $arl_task_obj = NACL::MTask::StorageAggregateRelocation->new(
            command_interface => $node,
            node              => $partner->node(),
            destination       => $node->node(),
            aggregate_list    => \@online_sfo_aggregates
        );

        # Create aggr-vol list for arl verification
        $arl_task_obj->create_aggr_vol_list(
            'aggregate-list' => \@online_sfo_aggregates );

        # Set is_fanta_takeover flag to 1. This flag will
        # be used for fanta ems verification.
        $is_fanta_takeover = 1;
        my $expected_partner_state = "wfg";

        # The node is addressed by name in the filter, as in the other
        # StorageFailover fetches of this module.
        my $state = NACL::CS::StorageFailover->fetch(
            command_interface => $node,
            filter            => { node => $node_name },
            requested_fields  => [qw(auto-giveback)]
        );
        my $expected_takeover_state =
          ( $state->auto_giveback() eq 'false' )
          ? "takeover"
          : "takeover_autogiveback_scheduled";

        # Start capturing ems logs
        $start_ems_capture->();

        # This is detector object will be used to verify aggregate
        # temporary relocation message 'sfo.aggr.relocated.temp'
        $arl_task_obj->create_event_log_detector();
        $self->arl_mtask_obj($arl_task_obj);

        # Calling the STask takeover to initiate a takeover.
        # Skipping failover state check, need to verify sfo
        # aggr relocation time.
        NACL::STask::StorageFailover->takeover(
            command_interface => $node,
            %common_opts,
            %sfo_opts,
            nacltask_verify              => 0,
            nacltask_check_partner_state => 0,
        );

        # Verify aggregate relocation.
        # ARL verification is mandatory for FANTA takeover,
        # So nacltask_verify_aggr_state_owner param value
        # is ignored here. Also, calculate takeover timeout
        # here. For fanta,
        # takeover_timeout = (No. of online aggrs) * SFO_AGGR_ONLINE_TIMEOUT
        # Otherwise it is TAKEOVER_TIMEOUT (which is 120 secs)
        my %method_timeout;
        if (@online_sfo_aggregates) {
            my $arl_timeout =
              scalar(@online_sfo_aggregates) * SFO_AGGR_ONLINE_TIMEOUT;
            $arl_task_obj->perform_arl_verifications(
                'method-timeout'    => $arl_timeout,
                nacltask_arl_type   => 'TakeOver',
                nacltask_verify_ems => 0,
            );
        }
        else {
            %method_timeout = ( 'method-timeout' => TAKEOVER_TIMEOUT );
        }

        # Now check failover state. Expected state is 'In takeover'.
        # An explicit timeout is only passed when there was no aggregate
        # to relocate; otherwise wait_for_state() uses its own default.
        NACL::STask::StorageFailover->wait_for_state(
            command_interface => $node,
            node              => $node,
            partner           => $partner,
            node_state        => $expected_takeover_state,
            %common_opts,
            %method_timeout,
        );
        $Log->comment('Takeover state is verified');

        # If nacltask_check_partner_state is specified, wait for wfg state.
        if ( $sfo_opts{nacltask_check_partner_state} ) {
            NACL::STask::StorageFailover->wait_for_state(
                command_interface => $node,
                node              => $node,
                partner           => $partner,
                node_state        => $expected_takeover_state,
                partner_state     => $expected_partner_state,
                %common_opts,
            );
        }
    }

    # Check nacltask_verify_ems
    if ($ems_check) {

        # Declared unconditionally: "my ... if ..." has undefined
        # behaviour and can keep a value from a previous call. With no
        # collected events every expected event is reported as missing.
        my $ems = [];
        if ( defined $proc && $proc->is_running() ) {
            $ems =
              $self->_get_result_and_stop( till_time => 120, proc => $proc );
        }

        ## Check for the presence of ems
        try {
            $Log->comment("Check for the presence of all non FANTA ems");
            my %ems = map { ( $_->{messagename} => 1 ) } @{$ems};
            my @missed_ems =
              grep { !defined $ems{$_} } @check_for_all_presence;
            if (@missed_ems) {

                # No $Log->exit() here: the exception is caught below and
                # the method keeps running.
                NACL::Exceptions::EventCheckFailure->throw(
                    'EMS Not Found : ' . Dumper( \@missed_ems ) );
            }

            $Log->comment("Print the event messages");
            foreach my $event (@check_for_all_presence) {
                my %print_args = (
                    event_array => $ems,
                    event       => $event,
                );

                # The hardware-assist events are printed together with
                # the takeover type.
                if (   $is_hwassist_type{$takeover_type}
                    && $is_hwassist_event{$event} )
                {
                    $print_args{takeover_type} = $takeover_type;
                }
                $self->_print_ems_messages(%print_args);
            }

            $Log->comment("Verify the takeover duration");
            $self->_verify_takeover_giveback_duration( event_array => $ems );

            # Verify fanta ems logs
            if ($is_fanta_takeover) {
                $self->_verify_fanta_ems(
                    event_array    => $ems,
                    sfo_aggregates => \@online_sfo_aggregates,
                );
            }
        }
        catch NACL::Exceptions::EventCheckFailure with {
            my $e = shift;
            $Log->warn( 'Event log verification failed. ' . $e->text() );
            $self->_check_ems_log(
                detector             => $system_log_detector,
                sfo_aggrs            => \@online_sfo_aggregates,
                takeover_type        => $takeover_type,
                nacltask_is_fanta    => $is_fanta_takeover,
                nacltask_log_retries => $nacltask_log_retries
            );
            push( @error, $e->text() );
        };
    }

    # check if there are no errors
    if (@error) {
        $Log->exit() if $may_exit;
        NACL::Exceptions::OperationFailed->throw(
            "Takeover failed: " . join( "\n", @error ) );
    }
    $Log->exit() if $may_exit;
}    ## end sub takeover

=head2 giveback

    $Node1->giveback();

(Instance method)

Initiate a giveback from Node1 and verify the giveback.

=over

=item Options

=over

=item C<< giveback_expected => $boolean >>

( Optional, defaults to 1 )

If 1, then giveback is expected to complete successfully.
If 0, then giveback is expected to fail due to any of the reasons
mentioned in "failure_reason"
The behaviour of "nacltask_verify" depends on whether a giveback is
expected or not.

=item C<< 'require-partner-waiting' => true | false >>

( Optional, defaults to UNDEF )

If this option is defined then the partner state is not checked before
issuing giveback. This is an option in "storage failover giveback" command.

=item C<< 'override-vetoes' => true | false >>

( Optional, defaults to UNDEF )

If this option is defined then the giveback will be done overriding the
vetoes. This is an option in "storage failover giveback" command.

=item C<< 'only-cfo-aggregates' => true | false >>

( Optional, defaults to UNDEF )

If this option is defined then only the cfo styled aggregates will be
given back and the node will enter a partial giveback state.
This is an option in "storage failover giveback" command

=item C<< auto_giveback_expected => 0 >>

(Optional) user input flag that is used to decide whether an auto
giveback is expected or not.
Example, if takeover happens due to partner reboot then even if the autogiveback flag is disabled, an autogiveback is expected to happen Default : 0 =item C<< polling_interval => $scalar >> ( Optional, defaults to 10secs ) This is the interval (in seconds) after which the status of giveback will be polled. =item C<< method_timeout => $method_timeout >> (Optional) Time in seconds when giveback is expected to complete Default : WFG_TO_SF_UP_TIMEOUT (320 seconds) plus SFO_AGGR_ONLINE_TIMEOUT (120 seconds) for each online sfo aggregate of the partner =item C<< failure_reason => $scalar >> ( Optional ) The reason for which the giveback is expected to fail. Possible values are unable_to_read_partner_state disk_inventory_information_not_yet_received autosupport_vetoed failed_due_to_diskcheck =item C<< nacltask_verify_sfo_state => $boolean >> (Optional, defaults to 1) If 1, then giveback verification methods will be invoked to verify the giveback scenario. If 0, the verification is not performed. =item C<< nacltask_verify_aggr_state_owner => $boolean >> (Optional, defaults to 1) If 1, then verify that all the aggregates are online after giveback. If the method is not able to query the aggregate and check its state after repeated polling for 60 seconds, due to the following error- "Error: show failed: Failed to get the information for aggregate {aggr}. Reason: ZSM - Can't connect to host", then an exception of type C<NACL::C::Exceptions::StorageAggregate::ZsmConnectionError> is thrown. =item C<< nacltask_verify_ems => $boolean >> (Optional, defaults to 1) If 1, then verify that the following ems messages are generated and giveback duration is within limits. =item C<< nacltask_wait_for_asup_generation => 0 >> (Optional, defaults to 0) User input flag that is used to wait if giveback is vetoed due to autosupport. 
=item C<< nacltask_wait_for_partner => 1 >>

(Optional) user input flag that is used to wait for partner to come to
wfg state before issuing a giveback
Default : 1

=item C<< nacltask_verify_asup => $boolean >>

(Optional, defaults to 0)

If 1, then verify that the appropriate asup's are generated.
Currently not supported.

=item C<< nacltask_verify_disk_ownership => $boolean >>

(Optional, defaults to 0)

If 1, then verify that the current owner for disks in sfo-styled aggrs
is the node

=item C<< nacltask_verify_arl_ems => $boolean >>

(Optional, defaults to 0)

If 1, then verify aggregate temporary relocation ems.

=back

=back

=cut

# Public entry point for a giveback. Forwards the caller's options to
# giveback_executed() through _execute_with_gather_diag().
# NOTE(review): _execute_with_gather_diag() is defined outside this view;
# presumably it raises the given exception class after gathering
# diagnostics when the wrapped method fails - confirm.
sub giveback {
    my ( $self, %args ) = @_;
    $self->_execute_with_gather_diag(
        method      => 'giveback_executed',
        exception   => 'NACL::MTask::Exceptions::HAGatherDiagException',
        method_args => \%args,
    );
}

# Initiates a giveback from $self->node() and runs the requested
# verifications (see the POD of giveback() for the option list).
#
# Steps: refresh the command interface, start the EMS capture, call the
# STask giveback, wait 60 seconds, verify aggregate state/owner, adjust the
# node date, verify the giveback EMS events and duration, and optionally
# verify the aggregate relocation EMS recorded by a previous FANTA takeover.
#
# Verification failures (NACL::Exceptions::UnexpectedState,
# NACL::Exceptions::EventCheckFailure) are collected and reported together
# in one NACL::Exceptions::OperationFailed. Other exceptions propagate
# unchanged.
sub giveback_executed {
    $Log->enter() if $may_enter;
    my $self = shift;
    my %opts = validate_with(
        params => \@_,
        spec   => {
            giveback_expected      => { type => BOOLEAN, default => 1 },
            auto_giveback_expected => { type => BOOLEAN, default => 0 },
            'require-partner-waiting' =>
              { regex => qr/^(true|false)$/, optional => 1 },
            'override-vetoes' =>
              { regex => qr/^(true|false)$/, optional => 1 },
            'only-cfo-aggregates' =>
              { regex => qr/^(true|false)$/, optional => 1 },
            polling_interval => { type => SCALAR, default => POLL_INTERVAL },
            failure_reason   => { type => SCALAR, optional => 1 },
            method_timeout   => { type => SCALAR, optional => 1 },
            nacltask_verify_sfo_state => { type => SCALAR, default => 1 },
            nacltask_verify_aggr_state_owner =>
              { type => BOOLEAN, default => 1 },
            nacltask_verify_ems => { type => BOOLEAN, default => 1 },
            nacltask_wait_for_asup_generation =>
              { type => BOOLEAN, default => 0 },
            nacltask_wait_for_partner   => { type => BOOLEAN, default => 1 },
            nacltask_verify_arl_ems     => { type => BOOLEAN, default => 0 },
            nacltask_skip_auto_giveback => { type => BOOLEAN, default => 0 },
        },
    );

    my %common_opts;
    $self->_move_common_component_params(
        source => \%opts,
        target => \%common_opts
    );

    $self->node()->refresh_command_interface();
    my $node      = $self->node();
    my $node_name = $node->node();
    my $partner =
      NACL::STask::Node->get_partner_obj( command_interface => $node );
    my $partner_name = $partner->node();

    ## Temporary fix to bypass the ems verification ( Fix will be back ported after the burt749767 is fixed )
    my $ems_check = delete $opts{nacltask_verify_ems};
    $ems_check = !$::BYPASS_EMS if ( defined $::BYPASS_EMS && $ems_check );
    my $arl_ems_check = delete $opts{nacltask_verify_arl_ems};
    my $aggr_check    = delete $opts{nacltask_verify_aggr_state_owner};

    my @online_sfo_aggregates = $self->_partner_online_sfo_aggrs();

    # _partner_cfo_aggr() is only set when an online cfo aggregate was seen
    # at construction time; drop the undef so no nameless aggregate is
    # handed to the verification.
    my @partner_aggr_names =
      grep { defined } ( @online_sfo_aggregates, $self->_partner_cfo_aggr );

    # Default giveback timeout: the wfg -> SF_UP allowance plus one sfo
    # aggregate allowance per online sfo aggregate of the partner.
    my $timeout =
      scalar(@online_sfo_aggregates) * SFO_AGGR_ONLINE_TIMEOUT +
      WFG_TO_SF_UP_TIMEOUT;

    # store any errors we have encountered
    my @error;

    # Constructing the hash to send to STask giveback
    my %sfo_opts;
    my @copy_opts = (
        'require-partner-waiting', 'override-vetoes',
        'only-cfo-aggregates',     'auto_giveback_expected',
        'nacltask_skip_auto_giveback'
    );
    my %opts_mapping = (
        nacltask_verify_sfo_state         => 'nacltask_verify',
        failure_reason                    => 'failure-reason',
        method_timeout                    => 'method-timeout',
        polling_interval                  => 'nacltask_poll_interval',
        nacltask_wait_for_asup_generation => 'wait-for-asup-generation',
        nacltask_wait_for_partner         => 'wait_for_partner',
    );
    $self->_hash_copy(
        source => \%opts,
        target => \%sfo_opts,
        map    => \%opts_mapping,
        copy   => \@copy_opts,
    );
    $sfo_opts{node}    = $node;
    $sfo_opts{partner} = $partner;
    if ( !defined $sfo_opts{'method-timeout'} ) {
        $sfo_opts{'method-timeout'} = $timeout;
    }

    # Set the verification details
    if ( !( $opts{giveback_expected} ) or ( defined $opts{failure_reason} ) )
    {
        $ems_check  = 0;
        $aggr_check = 0;
    }

    # Construct and start the EventLogDetector object
    my @check_for_all_presence = (
        'cf.fm.givebackStarted', 'cf.fm.givebackComplete',
        'cf.fm.givebackDuration'
    );
    my $detector =
      NACL::MTask::EventLogDetector->new( command_interface => $node, );
    $detector->start();

    my $proc;
    if ($ems_check) {
        my %args = ( 'detector' => $detector );
        $proc = NATE::Process->new(
            codespec => sub { $self->_event_collector(@_); },
            args     => [%args],
            runid    => 'GB_event_logs',
        );

        #start collecting event log.
        $proc->start;
    }

    # Calling the STask giveback to initiate a giveback
    # 320 seconds timeout for node to boot from wfg to
    # SF_UP. See Also burt603319.
    NACL::STask::StorageFailover->giveback(
        command_interface => $node,
        %sfo_opts,
        %common_opts,
    );

    # Fixed settle time before the post-giveback verifications start.
    sleep(60);

    # Call verify methods to validate the different elements after giveback
    # Aggr check
    if ($aggr_check) {

        # if 'only-cfo-aggregates' is true then verify
        # that sfo aggregates are not given back
        # The option is optional, so guard against undef before comparing.
        my $only_cfo = defined $opts{'only-cfo-aggregates'}
          && $opts{'only-cfo-aggregates'} eq "true";
        my $expected_owner = $only_cfo ? $node_name : $partner_name;

        try {
            $self->_verify_aggr_state_owner(
                aggr_names => \@partner_aggr_names,
                home_name  => $partner_name,
                owner_name => $expected_owner,
            );
        }
        catch NACL::Exceptions::UnexpectedState with {
            my $exception_object = shift;
            push( @error, $exception_object->text() );
        };
    }

    # Reads the node's current date and writes it back through
    # NACL::C::ClusterDate->modify() with month and day zero-padded.
    # NOTE(review): from the indices used, the date string presumably looks
    # like '"M/D/YYYY HH:MM:SS' so that field 0 is empty - confirm against
    # NACL::CS::ClusterDate.
    my $modify_system_date = sub {
        my $SystemNodeDate = NACL::CS::ClusterDate->fetch(
            command_interface => $node,
            filter            => { node => $node_name }
        );
        my $Date = $SystemNodeDate->date();
        $Log->debug("Date : $Date");

        my @dateandtime = split /\"|\/| |\:/, $Date;

        # sprintf pads "9" to "09" and leaves "09" and "12" unchanged;
        # prepending "0" by hand would turn "09" into "009".
        my $month = sprintf( '%02d', $dateandtime[1] );
        my $day   = sprintf( '%02d', $dateandtime[2] );

        NACL::C::ClusterDate->modify(
            command_interface => $node,
            dateandtime       =>
"$dateandtime[3]$month$day$dateandtime[4]$dateandtime[5].$dateandtime[6]",
        );
    };
    if ( $ems_check || $arl_ems_check ) {
        $modify_system_date->();
    }

    # EMS check
    if ($ems_check) {
        try {

            # Declared unconditionally: "my ... if ..." has undefined
            # behaviour and can keep a value from a previous call. With no
            # collected events every expected event is reported as missing.
            my $ems = [];
            if ( defined $proc && $proc->is_running() ) {
                $ems = $self->_get_result_and_stop( proc => $proc );
            }

            my %ems = map { ( $_->{messagename} => 1 ) } @{$ems};
            my @missed_ems =
              grep { !defined $ems{$_} } @check_for_all_presence;
            if (@missed_ems) {

                # No $Log->exit() here: the exception is caught below and
                # the method keeps running.
                NACL::Exceptions::EventCheckFailure->throw(
                    'EMS Not Found : ' . Dumper( \@missed_ems ) );
            }

            foreach my $messagename (@check_for_all_presence) {
                $self->_print_ems_messages(
                    event_array => $ems,
                    event       => $messagename,
                );
            }

            $Log->comment("Verify the giveback duration");
            $self->_verify_takeover_giveback_duration( event_array => $ems );
        }
        catch NACL::Exceptions::EventCheckFailure with {
            my $exception_object = shift;
            push( @error, $exception_object->text() );
        };
    }

    # Verify aggregate temporary relocation ems
    if ( $arl_ems_check && defined $self->arl_mtask_obj() ) {
        $self->arl_mtask_obj()->verify_fanta_to_ems_messages();
    }

    # check if there are no errors
    if (@error) {
        $Log->exit() if $may_exit;
        NACL::Exceptions::OperationFailed->throw(
            "Giveback failed: " . join( "\n", @error ) );
    }
    $Log->exit() if $may_exit;
}    ## end sub giveback

=head2 reset_HA_flags

    $HA1->reset_HA_flags(node => $node);

(Instance method)

Method to reset the following HA related flags to their default values
This method is called by default on creating a instance of this package
    'auto-giveback'
    'check-partner'
    'detection-time'
    'onfailure'
    'onpanic'
    'onreboot'
    'abort-operations'
    'delay-seconds'

=over

=item Options

=over

=item C<< node => $node >>

(Required, scalar)

Name of the node on which to reset the HA flags. A comma-separated list of
node names may be given, as done on object creation for the node and its
partner.
=back

=back

=cut

## Public entry point for resetting the HA flags.
## Delegates to reset_HA_flags_executed() through
## _execute_with_gather_diag(), so any exception raised while resetting is
## converted with NACL::MTask::Exceptions::HAGatherDiagException.
## All arguments are forwarded unchanged as the delegate's arguments.
sub reset_HA_flags {
    my ( $self, %args ) = @_;

    $self->_execute_with_gather_diag(
        method      => 'reset_HA_flags_executed',
        exception   => 'NACL::MTask::Exceptions::HAGatherDiagException',
        method_args => \%args,
    );
}

## Worker for reset_HA_flags().
## Options:
##   node => name of the node whose storage failover flags are modified
##           (validated as a SCALAR).
## Sets auto-giveback/onfailure/onpanic/onreboot to "true", delay-seconds to
## 600 and detection-time to the MCC-IP or non-MCC-IP constant, depending on
## whether the "metrocluster show" output mentions "IP-fabric".
## Throws NACL::APISet::Exceptions::ResponseException if the modify fails.
sub reset_HA_flags_executed {
    $Log->enter() if $may_enter;
    my $self = shift;
    my %opts = validate_with(
        params => \@_,
        spec   => { node => { type => SCALAR }, }
    );
    my %common_opts;
    $self->_move_common_component_params(
        source => \%opts,
        target => \%common_opts
    );

    $Log->comment("Verify setup is MCC_IP or not");
    my $node   = $self->node();
    my $mcc_ip = $node->apiset()->execute_raw_command(
        command => "metrocluster show -fields configuration-type" );

    my $detection_time;
    if ( $mcc_ip =~ /IP-fabric/ ) {
        $detection_time = MCC_IP_DETECTION_TIME;
    }
    else {
        $detection_time = NON_MCC_IP_DETECTION_TIME;
    }

    try {
        NACL::C::StorageFailover->modify(
            command_interface => $node,
            node              => $opts{node},
            'auto-giveback'   => "true",
            'detection-time'  => $detection_time,
            'onfailure'       => "true",
            'onpanic'         => "true",
            'onreboot'        => "true",
            'delay-seconds'   => "600",
            'method-timeout'  => '180',
            %common_opts,
        );
    }
    catch NACL::APISet::Exceptions::ResponseException with {
        my $exception_object = shift;
        $Log->comment(
            "Failed to reset the HA flags, " . $exception_object->text() );

        # Balance the enter() above before leaving via the exception, as
        # every other throw site in this file does.
        $Log->exit() if $may_exit;
        NACL::APISet::Exceptions::ResponseException->throw(
            "Failed to reset HA flags");
    };
    $Log->exit() if $may_exit;
}    ## end sub reset_HA_flags

## Method to verify aggregates' state is online and ownership of the aggregates
## Options:
##   aggr_names => ARRAYREF (validated; the SFO list actually checked comes
##                 from $self->_partner_online_sfo_aggrs())
##   home_name  => expected home-name of the aggregates
##   owner_name => expected owner-name of the SFO aggregates
## For each SFO aggregate, waits until state/owner-name/home-name have the
## expected values, then checks the recorded state of each of its volumes.
## Afterwards the CFO aggregate must be online with owner and home both equal
## to home_name.
## Throws NACL::Exceptions::UnexpectedState on any mismatch.
sub _verify_aggr_state_owner {
    $Log->enter() if $may_enter;
    my ( $self, @args ) = @_;
    my %opts = validate_with(
        params => \@args,
        spec   => {
            aggr_names => { type => ARRAYREF },
            home_name  => { type => SCALAR },
            owner_name => { type => SCALAR },
        }
    );
    my $destination = $opts{owner_name};
    my $home_name   = $opts{home_name};
    my %common_opts;
    $self->_move_common_component_params(
        source => \%opts,
        target => \%common_opts
    );

    my @unexpected_results;
    my @aggregate_list     = $self->_partner_online_sfo_aggrs();
    my $aggregate_vol_list = $self->_partner_sfo_aggr_volumes();
    my $node               = $self->node();
    $Log->comment(
        "This is the sfo aggregate_list " . Dumper( \@aggregate_list ) );

    # Tolerate a missing volume map instead of dying on an undef deref.
    my %aggr_vol = %{ $aggregate_vol_list || {} };

    # The timeout is the same for every aggregate, so work it out once.
    # Scaling it inside the loop multiplied it again for each aggregate on
    # gcov builds (40 -> 240 -> 1440 ...).
    my $timeout = AGGR_ONLINE_TIMEOUT;
    if (@aggregate_list) {
        my $ver_mgr    = $node->get_version_manager();
        my $build_type = $ver_mgr->get_version_attribute(
            attribute => "build_options" );

        # Type of build Eg: x86_64.debug.gcov
        $Log->comment("build_type is : $build_type");
        $Log->comment("timeout is : $timeout");
        if ( $build_type =~ /gcov/i ) {
            $timeout = 6 * $timeout;
            $Log->comment("timeout is setting for gcov: $timeout");
        }
        else {
            $Log->comment("timeout is setting for non-gcov: $timeout");
        }
    }

    foreach my $aggregate (@aggregate_list) {
        $Log->comment("This is the aggregate $aggregate ");
        try {
            $Log->comment(
                "Verify that state=online, owner-name=$destination "
                  . "home-name=$home_name, for aggr $aggregate" );

            # Retry the wait when the ZSM connection error is raised.
            nacl_method_retry(
                code => sub {
                    NACL::C::StorageAggregate->wait_on_attributes(
                        aggregate         => $aggregate,
                        command_interface => $node,
                        attributes        => [
                            {
                                attribute_to_check => 'state',
                                till_value         => 'online',
                            },
                            {
                                attribute_to_check => 'owner-name',
                                till_value         => $destination,
                            },
                            {
                                attribute_to_check => 'home-name',
                                till_value         => $home_name,
                            }
                        ],
                        'method-timeout' => $timeout
                    );
                },
                tries_count => 6,
                sleep_time  => 10,
                exceptions =>
                  'NACL::C::Exceptions::StorageAggregate::ZsmConnectionError'
            );

            if ( defined $aggr_vol{$aggregate} ) {
                my %vol_details = %{ $aggr_vol{$aggregate} };
                foreach my $vol_name ( sort keys %vol_details ) {
                    my $vol_info = $vol_details{$vol_name};
                    $Log->comment( "Verifying that state="
                          . $vol_info->{state}
                          . " for volume $vol_name in aggr $aggregate" );
                    $self->_check_volume_state(
                        volume           => $vol_name,
                        vserver          => $vol_info->{'vserver'},
                        aggregate        => $aggregate,
                        state            => $vol_info->{'state'},
                        'method-timeout' => $timeout,
                        polling_interval => POLL_INTERVAL,
                    );
                }
            }
        }
        catch NACL::Exceptions::Timeout with {
            my $exception = shift;
            $Log->debug(
                sub {
                    "Exception text received in _verify_aggr_state_owner "
                      . $exception->text();
                }
            );
            my $error_message =
                "After waiting for $timeout seconds the relocated "
              . "Aggregate attributes\nowner_name,home_name,state,"
              . "volumes' state do not have expected values for "
              . "aggregate $aggregate";
            $Log->debug( sub { $error_message } );

            # Collect the failure and carry on with the next aggregate.
            push @unexpected_results,
              {
                aggregate => $aggregate,
                text      => $exception->text(),
              };
        };
    }    ## foreach ends here

    if ( scalar @unexpected_results ) {
        my $text = join "\n",
          map { "$_->{aggregate} : $_->{text}" } @unexpected_results;
        $Log->exit() if $may_exit;
        NACL::Exceptions::UnexpectedState->throw(
"List of failures in the Aggregate verification are as follows\n$text\n",
            unexpected_results => \@unexpected_results
        );
    }

    ### Verify CFO aggr ownership and state.
    my $cfo_aggr = $self->_partner_cfo_aggr();
    my ( $state, $home, $owner );
    my $end_time = time() + CFO_AGGR_ONLINE_TIMEOUT;
    while ( time() < $end_time ) {
        $Log->comment("Inside the loop to check the aggr state");
        my $aggr_obj = NACL::CS::StorageAggregate->fetch(
            command_interface => $node,
            filter            => { aggregate => $cfo_aggr },
            requested_fields  => [qw(aggregate state home-name owner-name)],
            %common_opts,
        );
        $home  = $aggr_obj->home_name();
        $owner = $aggr_obj->owner_name();

        # "-" means the field is not populated yet; poll again.
        if ( ( $home eq "-" ) or ( $owner eq "-" ) ) {
            sleep(10);
        }
        else {
            $state = $aggr_obj->state();
            last;
        }
    }

    ## verify ownership of cfo aggr
    if ( ( $owner ne $home_name ) or ( $home ne $home_name ) ) {
        $Log->exit() if $may_exit;

        # The owner is compared against home_name, so report home_name as
        # the expected owner (the message used to print owner_name).
        NACL::Exceptions::UnexpectedState->throw(
                "Owner of aggr $cfo_aggr not as expected, "
              . "Owner: Actual - $owner, Expected - $home_name, "
              . "Home: Actual - $home, Expected - $home_name, " );
    }

    ## verify whether aggr is online
    if ( !defined $state or $state ne "online" ) {
        $Log->exit() if $may_exit;
        NACL::Exceptions::UnexpectedState->throw(
            "Aggr $cfo_aggr is not online");
    }
    $Log->exit() if $may_exit;
}    ## end sub _verify_aggr_state_owner()

## Helper method to print the 'cf' ems messages in EventLogDetector object
## Options:
##   event_array   => ARRAYREF of hashes with 'messagename' and 'event' keys
##   event         => message name to look for
##   takeover_type => (optional) takeover type the hwassist trap must match
## Logs the first entry whose messagename equals 'event'. If that entry is a
## cf.hwassist.takeoverTrapRecv message, its text must carry a known trap
## reason; when takeover_type is given the reason must also belong to that
## takeover type. Throws NACL::Exceptions::EventCheckFailure otherwise.
sub _print_ems_messages {
    $Log->enter() if $may_enter;
    my $self = shift;
    my %opts = validate_with(
        params => \@_,
        spec   => {
            event_array   => { type => ARRAYREF },
            event         => { type => SCALAR },
            takeover_type => { type => SCALAR, optional => 1 },
        }
    );
    my %common_opts;
    $self->_move_common_component_params(
        source => \%opts,
        target => \%common_opts
    );
    my $event_array   = $opts{event_array};
    my $event         = $opts{event};
    my $takeover_type = $opts{takeover_type};

    my $hwassist_ems = "cf.hwassist.takeoverTrapRecv";

    # [ pattern in the trap text, matching takeover type, text to log ].
    # Order matters: entries are tried top to bottom.
    my @trap_table = (
        [ qr/power_cycle_via_/, 'system_powercycle', 'power_cycle_via_sp/rlm' ],
        [ qr/power_off_via_/,   'system_power_off',  'power_off_via_sp/rlm' ],
        [ qr/reset_/,           'system_reset',      'reset_via_sp/rlm' ],
        [ qr/watchdog_reset/,   'watchdog_reset',    'watchdog_reset' ],
    );

    foreach my $hash (@$event_array) {
        next if $hash->{messagename} ne $event;

        my $ems = $hash->{event};
        $Log->comment("EMS : $ems is present");

        #Checking if the ems is hwassist ems, and printing the messages
        if ( $ems =~ /\Q$hwassist_ems\E/ ) {
            my $trap_label;
            foreach my $trap (@trap_table) {
                my ( $pattern, $expected_type, $label ) = @$trap;
                next if $ems !~ $pattern;

                # This used to be an assignment ("=") and so was always
                # true. Compare instead, and stay lenient when the caller
                # did not say which takeover type was performed.
                next
                  if defined $takeover_type
                  && $takeover_type ne $expected_type;
                $trap_label = $label;
                last;
            }
            if ( !defined $trap_label ) {
                $Log->exit() if $may_exit;
                NACL::Exceptions::EventCheckFailure->throw(
                    "EMS $ems is not having the correct trap event");
            }
            $Log->comment("Received trap message: $ems with $trap_label");
        }
        last;
    }
    $Log->exit() if $may_exit;
}    ## end sub _print_ems_messages

## Helper method to get the takeover/giveback duration from the ems msgs
## Options:
##   event_array => ARRAYREF of hashes with an 'event' key (the EMS text)
## Every event whose text contains "Duration" is split on whitespace; field 7
## is used as the duration and field 3 as the operation name.
## NOTE(review): those field positions assume a fixed EMS wording - confirm.
## A duration above MAX_TO_GB_DURATION only produces a warning.
## Throws NACL::Exceptions::EventCheckFailure if no duration event is found.
sub _verify_takeover_giveback_duration {
    $Log->enter() if $may_enter;
    my $self = shift;
    my %opts = validate_with(
        params => \@_,
        spec   => { event_array => { type => ARRAYREF } }
    );
    my $verify      = 0;
    my $event_array = $opts{event_array};

    foreach my $event ( @{$event_array} ) {
        my $event_name = $event->{event};
        next if $event_name !~ /Duration/;

        $verify = 1;
        my @tmp      = split( /\s+/, $event_name );
        my $duration = $tmp[7];
        if ( $duration > MAX_TO_GB_DURATION ) {
            $Log->warn( " $tmp[3] duration is greater than "
                  . MAX_TO_GB_DURATION
                  . " seconds : Actual Duration = $duration" );
        }
        else {
            $Log->comment("$tmp[3] has completed in $duration seconds");
        }
    }
    if ( !$verify ) {
        $Log->exit() if $may_exit;
        NACL::Exceptions::EventCheckFailure->throw(
            "takeover/giveback duration ems not found");
    }
    $Log->exit() if $may_exit;
}    ## end sub _verify_takeover_giveback_duration

## Verify that the takeover EMS messages expected for this setup are present.
## Options:
##   event_array    => ARRAYREF of hashes with 'messagename'/'event' keys
##   sfo_aggregates => ARRAYREF; a non-empty list selects the "SFO" message
##                     set, an empty list the "no SFO aggregate" set
## The message sets depend on whether the 'modelinfo' version attribute
## matches SIMBOX. All candidate messages are first passed through
## _print_ems_messages(); then each message of the selected set must occur in
## the text of at least one event.
## Throws NACL::Exceptions::EventCheckFailure listing the missing messages.
sub _verify_fanta_ems {
    $Log->enter() if $may_enter;
    my $self = shift;
    my %opts = validate_with(
        params => \@_,
        spec   => {
            event_array    => { type => ARRAYREF },
            sfo_aggregates => { type => ARRAYREF },
        }
    );
    my $version_manager = $self->node->get_version_manager();
    my $modelinfo =
      $version_manager->get_version_attribute( attribute => 'modelinfo' );

    my @sfo_msgs;
    my @non_sfo_msgs;
    if ( $modelinfo =~ m[SIMBOX]i ) {
        @sfo_msgs = (
            'ha.takeover.stateChng', 'sfo.takeover.sfoStart',
            'sfo.takeover.relocDone'
        );
        @non_sfo_msgs = ('sfo.takeover.bypassed');
    }
    else {
        @sfo_msgs = (
            'ha.takeover.stateChng',  'sfo.takeover.sfoStart',
            'sfo.takeover.relocDone', 'cf.transition.summary'
        );
        @non_sfo_msgs = ( 'sfo.takeover.bypassed', 'cf.transition.summary' );
    }

    my $event_array = $opts{event_array};
    foreach my $event ( @sfo_msgs, @non_sfo_msgs ) {
        $self->_print_ems_messages(
            event_array => $event_array,
            event       => $event
        );
    }

    my @expected_msgs =
      @{ $opts{sfo_aggregates} } ? @sfo_msgs : @non_sfo_msgs;

    my $error_msgs = '';
    foreach my $msg (@expected_msgs) {

        # \Q..\E: the message name is literal text, not a pattern.
        my $seen = grep { $_->{event} =~ m[\Q$msg\E] } @{$event_array};
        $error_msgs .= "$msg ems not found\n" if !$seen;
    }
    if ( length $error_msgs ) {
        $Log->exit() if $may_exit;
        NACL::Exceptions::EventCheckFailure->throw($error_msgs);
    }
    $Log->exit() if $may_exit;
}    ## end sub _verify_fanta_ems

## Wait until a volume reaches the expected state.
## Options:
##   aggregate        => aggregate the volume lives in
##   volume           => volume name
##   vserver          => vserver owning the volume
##   state            => expected state (default 'online')
##   'method-timeout' => overall time budget in seconds
##   polling_interval => polling interval handed to wait_on_attribute
## Throws NACL::Exceptions::Timeout when the volume could not be verified
## within 'method-timeout' seconds.
sub _check_volume_state {
    $Log->enter() if $may_enter;
    my $self = shift;

    # validate the parameters
    my %opts = validate_with(
        params => \@_,
        spec   => {
            aggregate        => { type => SCALAR },
            volume           => { type => SCALAR },
            vserver          => { type => SCALAR },
            state            => { type => SCALAR, default => 'online' },
            'method-timeout' => { type => SCALAR },
            polling_interval => { type => SCALAR },
        },
    );
    my $volume           = delete $opts{volume};
    my $vserver          = delete $opts{vserver};
    my $aggregate        = delete $opts{aggregate};
    my $state            = delete $opts{state};
    my $timeout          = delete $opts{'method-timeout'};
    my $polling_interval = delete $opts{polling_interval};

    # query NACL::CS::Volume to see status.
    my $pending  = 1;
    my $end_time = time() + $timeout;
    while ($pending) {
        try {
            my $vol_cs_obj = NACL::CS::Volume->fetch(
                command_interface => $self->node(),
                requested_fields  => ['state'],
                is_system_vol     => 0,
                filter            => {
                    volume    => $volume,
                    vserver   => $vserver,
                    aggregate => $aggregate
                }
            );
            if ( $vol_cs_obj->state ne $state ) {
                my $vol_obj = $vol_cs_obj->get_component_instance();
                $vol_obj->wait_on_attribute(
                    'method-timeout'   => $timeout,
                    attribute_to_check => "state",
                    till_value         => [$state],
                    polling_interval   => $polling_interval
                );
            }
            $pending = 0;
        }    ## end try
        catch NACL::Exceptions::NoElementsFound with {

            # The fetch found no matching volume; pause and look again.
            Tharn::snooze(POLL_DELTA);
        };

        # A verification that succeeded must not be reported as a timeout
        # just because it finished after the deadline.
        last if !$pending;

        if ( time() > $end_time ) {
            $Log->exit() if $may_exit;
            NACL::Exceptions::Timeout->throw(
                "After waiting for $timeout seconds 'state' of volume $volume "
                  . " in aggregate $aggregate is not expected $state " );
        }
    }    ## end while ($pending)
    $Log->exit() if $may_exit;
}    ## end sub _check_volume_state

## Check the logs returned by a SystemLogDetector for the takeover EMS.
## Options:
##   detector               => NACL::MTask::SystemLogDetector object
##   sfo_aggrs              => ARRAYREF; non-empty selects the SFO message set
##   takeover_type          => takeover type; the four hwassist types also
##                             require the hwassist trap messages
##   nacltask_log_retries   => (optional) number of attempts, default 3
##   nacltask_poll_interval => (optional) pause between attempts, default 10
##   nacltask_is_fanta      => when true, add the model-specific message set
## Throws NACL::Exceptions::EventCheckFailure when the messages are still
## incomplete after the last attempt.
sub _check_ems_log {
    $Log->enter() if $may_enter;
    my ( $self, @args ) = @_;
    my %opts = validate_with(
        params => \@args,
        spec   => {
            detector => {
                type => OBJECT,
                isa  => 'NACL::MTask::SystemLogDetector'
            },
            sfo_aggrs              => { type => ARRAYREF },
            takeover_type          => { type => SCALAR },
            nacltask_log_retries   => { type => SCALAR, optional => 1 },
            nacltask_poll_interval => { type => SCALAR, optional => 1 },
            nacltask_is_fanta      => { type => BOOLEAN },
        },
        allow_extra => 1
    );

    # The key used to be misspelt "nacltask_log_retires" here, so the
    # caller's value was never honoured.
    my $retries       = ( delete $opts{nacltask_log_retries} )   || 3;
    my $poll_interval = ( delete $opts{nacltask_poll_interval} ) || 10;
    my $is_fanta      = delete $opts{nacltask_is_fanta};
    my $aggrs         = delete $opts{sfo_aggrs};
    my $detector      = delete $opts{detector};
    my $takeover_type = delete $opts{takeover_type};

    my %fanta_ems = (
        SIMBOX => {
            sfo_aggr_ems => [
                'ha.takeover.stateChng', 'sfo.takeover.sfoStart',
                'sfo.takeover.relocDone'
            ],
            non_sfo_aggr_ems => ['sfo.takeover.bypassed'],
        },
        HARDWARE => {
            sfo_aggr_ems => [
                'ha.takeover.stateChng',  'sfo.takeover.sfoStart',
                'sfo.takeover.relocDone', 'cf.transition.summary'
            ],
            non_sfo_aggr_ems =>
              [ 'sfo.takeover.bypassed', 'cf.transition.summary' ],
        },
    );

    # Text expected in the log for each hwassist-driven takeover type.
    my %hwassist_ems = (
        system_powercycle => 'power_cycle_via_',
        system_power_off  => 'power_off_via_',
        system_reset      => 'reset_',
        watchdog_reset    => 'watchdog_reset',
    );
    my @common_ems = (
        'cf.fm.takeoverStarted', 'cf.fm.takeoverDuration',
        'cf.fm.takeoverComplete'
    );

    my @ems_array;
    if ($is_fanta) {
        my $version_manager = $self->node->get_version_manager();
        my $modelinfo =
          $version_manager->get_version_attribute( attribute => 'modelinfo' );
        my $model    = $modelinfo =~ m[simbox]i ? 'SIMBOX' : 'HARDWARE';
        my $sfo_aggr = @$aggrs ? 'sfo_aggr_ems' : 'non_sfo_aggr_ems';
        push @ems_array, @{ $fanta_ems{$model}->{$sfo_aggr} };
    }

    my $hwassist_flag = exists $hwassist_ems{$takeover_type} ? 1 : 0;
    if ($hwassist_flag) {
        push @ems_array, 'cf.hwassist.takeoverTrapRecv';
        push @ems_array, 'cf.fsm.stateTransit';
    }
    push @ems_array, @common_ems;

    # The names are searched with '.' replaced by '_'.
    # NOTE(review): presumably the raw log spells them that way - confirm.
    my @ems_patterns =
      map { ( my $name = $_ ) =~ s/\./_/g; $name } @ems_array;

    my $buffer;
    my @missing_ems;
    my $msgs;
    while ($retries) {
        try {
            $Log->debug("Retry is: $retries");
            $msgs = $detector->get_logs(
                skip_cached_logs     => 1,
                command_output_level => 'trace'
            );
        }
        catch NACL::APISet::Exceptions::TimeoutException with {
            my $exception = shift;

            # Use whatever output was captured and give the next attempt
            # a longer timeout.
            $msgs = $exception->output();
            $detector->nacltask_timeout(
                $detector->nacltask_timeout() + 600 );
        };

        my $found = 1;
        $buffer      = undef;
        @missing_ems = ();
        foreach my $ems (@ems_patterns) {
            if ( $msgs =~ m[$ems]i ) {
                $found = 0
                  if ( $hwassist_flag
                    && $msgs !~ m[$hwassist_ems{$takeover_type}]i );
                $buffer .= "$1\n\n"
                  while ( $msgs =~ m[<(LR.*?$ems.*?)/>]gsi );
            }
            else {
                $found = 0;
                push @missing_ems, $ems;
            }
        }
        last if $found;

        $retries--;

        # No point in pausing when there is no attempt left.
        Tharn::snooze($poll_interval) if $retries;
    }
    if ( !$retries ) {
        $Log->exit() if $may_exit;
        NACL::Exceptions::EventCheckFailure->throw(
            'EMS Not Found : ' . Dumper( \@missing_ems ) );
    }
    $Log->trace( '-------------EMS logs----------------' . "\n"
          . ( defined $buffer ? $buffer : '' ) );
    $Log->exit() if $may_exit;
}

## Body of the child process that keeps collecting event log entries.
## Options:
##   detector => NACL::MTask::EventLogDetector object (already started)
## Loops forever: asks the detector for the events of the master list, keeps
## the ones that were seen as plain hashes, narrows the list to what is still
## unmatched and runs the event loop for 2 seconds. On a "stop_process"
## message the collected hashes are sent to the parent as "transfer_result".
sub _event_collector {
    my ( $self, @args ) = @_;
    my %opts = validate_with(
        params => \@args,
        spec   => {
            detector => {
                type => OBJECT,
                isa  => 'NACL::MTask::EventLogDetector'
            },
        },
        allow_extra => 1
    );
    my $detector   = $opts{detector};
    my @all_events = ();
    my $listener;
    my $node      = $self->node();
    my $node_name = $node->node();

    ## This will be called when Parent sends stop_process signal.
    ## When stop_process is called the result is sent through IPC
    ## the return message is marked by 'transfer_result' message.
    my $callback_sub_ref = sub {
        $Log->comment("Sending message to Parent");
        $Parent->message_put(
            type    => "transfer_result",
            message => \@all_events
        );
        listener_remove($listener);
    };
    $listener = {
        filter   => sub { ( $_[0]->{type} eq "stop_process" ) },
        callback => $callback_sub_ref
    };
    listener_add($listener);

    # Sending the event objects themselves does not work in IPC, so each
    # one is flattened into a plain hash first.
    my $as_plain_hash = sub {
        my ($event_obj) = @_;
        return {
            messagename => $event_obj->messagename(),
            time        => $event_obj->time(),
            event       => $event_obj->event(),
            seqnum      => $event_obj->seqnum(),
        };
    };

    #add events for TO & GB too.
    my $master_event_list = [
        qr/^cf\.fsm\.state.*/,
        qr/^cf\.hwassist\.takeoverTrap.*/,
        qr/^ha\.takeover.*/,
        qr/^sfo\.takeover.*/,
        qr/^cf\.transition.*/,
        qr/^cf\.fsm\.state.*/,
        qr/^cf\.hwassist\.takeoverTrap.*/,
        'ha.takeover.stateChng',
        'sfo.takeover.sfoStart',
        'sfo.takeover.relocDone',
        'sfo.takeover.bypassed',
        'cf.fm.takeoverStarted',
        'cf.fm.takeoverComplete',
        'cf.fm.takeoverDuration',
        'cf.fsm.stateTransit',
        'cf.hwassist.takeoverTrapRecv',
        'cf.fm.givebackStarted',
        'cf.fm.givebackComplete',
        'cf.fm.givebackDuration'
    ];

    while (1) {
        try {
            my @events = $detector->stop(
                node                     => $node_name,
                check_for_all_presence   => $master_event_list,
                ignore_if_unsynchronized => 1
            );

            # Control will never come here... but just in case...
            push( @all_events, map { $as_plain_hash->($_) } @events );
        }
        catch NACL::Exceptions::EventCheckFailure with {

            # get list of events that are not yet found.
            # It is unlikely that all events will be found
            # Because each 'takeover' type is different & this
            # Master list contains all the events for all takeover type.
            # But this is OK. All we are interested is to find remaining
            # events.
            my $exception             = shift;
            my $matched_event_arr_ref = $exception->matched_events();
            if ( $matched_event_arr_ref && @$matched_event_arr_ref ) {

                # NOTE(review): the dump is passed as a second argument;
                # confirm that comment() logs more than its first one.
                $Log->comment( "matched events: ",
                    Dumper($matched_event_arr_ref) );
                push( @all_events,
                    map { $as_plain_hash->($_) } @$matched_event_arr_ref );
            }

            # reduce master event list to what is not discovered yet.
            my $unmatched = $exception->unmatched_events();
            if ($unmatched) {
                $master_event_list = $unmatched;
                $Log->comment("**master event list**: @$master_event_list");
            }
        };

        #reset begin time for event log fetch.
        $detector->event_begin_time( $detector->event_end_time() );
        eventloop( seconds => 2 );
    }
}

## Ask the collector process for its result and shut it down.
## Options:
##   till_time => (optional) seconds to sleep before stopping the process,
##                to give the EMS messages time to be generated
##   proc      => NATE::Process object running _event_collector()
## Returns the "transfer_result" message received from the process: a
## reference to an array of hashes, e.g.
##   [
##     {
##       'time'        => '"11/8/2013 10:28:30"',
##       'seqnum'      => '1798',
##       'messagename' => 'license.db.migrate.vol.success',
##       'event'       => '"license.db.migrate.vol.success: Successfully
##                         migrated licenses for volume component_flex_aggr "'
##     },
##     ...
##   ]
sub _get_result_and_stop {
    my ( $self, @args ) = @_;
    my %opts = validate_with(
        params => \@args,
        spec   => {
            till_time => { type => SCALAR, optional => 1 },
            proc      => { type => OBJECT, isa      => 'NATE::Process' },
        },
        allow_extra => 1,
    );
    my $till_time = $opts{till_time};
    my $proc      = $opts{proc};

    ## Wait for '$till_time' seconds for ems generation and stop the process
    sleep($till_time) if $till_time;

    # send the stop signal to child proces.
    $proc->message_put( type => "stop_process", message => "", );

    # get result from child process.
    my $ems = $proc->message_get( type => "transfer_result" );

    $proc->stop;
    $proc->destroy;
    return $ems;
}

## Run one of this object's methods and convert any exception it raises.
## Options:
##   method      => name of the method to call on $self
##   exception   => class whose convert() is applied to a copy of the
##                  exception
##   method_args => (optional) HASHREF of arguments for the method
## On failure a Storable copy of the exception is converted, given the
## method arguments plus command_interface via sfo_args(), made the base of
## the original exception, and the original exception is thrown again.
sub _execute_with_gather_diag {
    my ( $self, @opts ) = @_;
    my %opts = validate_with(
        params => \@opts,
        spec   => {
            method      => { type => SCALAR },
            exception   => { type => SCALAR, },
            method_args => { type => HASHREF, optional => 1 },
        },
    );
    my $method    = $opts{method};
    my $exception = $opts{exception};
    my %sfo_args;
    if ( defined $opts{method_args} ) {
        %sfo_args = %{ $opts{method_args} };
    }

    try {
        $self->$method(%sfo_args);
    }
    otherwise {
        my $ex = shift();

        # freeze/thaw yields an independent copy of the exception.
        my $exception_obj = thaw( freeze($ex) );

        # Reblessing $exception to be of type $exception_obj
        $exception->convert( exception => $exception_obj );
        $sfo_args{command_interface} = $self->node();
        $exception_obj->sfo_args(%sfo_args);
        $exception_obj->make_base_of($ex);
        $ex->throw();
    };
}

1;