# -*- perl -*-
## @summary Reboot/Power cycle for a number of cycles and check storage system
## consistency via cluster cli for storage interface
##
## @description This test is to verify both storage system state in Cmode
## and cluster cli storage interface in each cycle
##
## @param NODES Required: Hostspec of all the filers in a cluster
## Ex: ntest -hw NODES=svtl-3170-5,svtl-3170-6 storagev2/ui/cluster_cli_reboot_power.thpl
##
## @param FILER_RPOWER Required: rpower entry for the given filer.
## NOTE: MANDATORY PARAM if param ACTION=power is specified
## Example : FILER_RPOWER=stor-2220-3a-3b.lab.netapp.com
##
## @param FILER_RPOWER_FULL_PATH optional : rpower full path for the given filer
## Default : /usr/software/utils/bin/rpower
##
## @param CLIENT optional: CLIENT from which rpower command has to be executed
## Default: Will execute on the CLIENT where the testcase is executed
##
## @param CYCLE optional: Number of cycles, default 10
##
## @param SLEEP optional: Sleep after performing rpower, default 600 Sec
##
## @param INTERVAL optional: Sleep time after completing each CYCLE. (removed)
##
## @param ACTION optional: reboot or power, default reboot.
##
## @param RUN_ALL_NODES optional: if yes it runs on all nodes in the cluster sequentially, if no it selects a random node, default no.
##
## @synopsis
## storagev2/ui/cluster_cli_reboot_power.thpl
##
## @dependencies
## These tests can be run in only Cmode
##
## @status Design
##
## @author sreedhar@netapp.com
##
## @examples
##
## VARIANT=run_reboot_all_nodes storagev2/ui/cluster_cli_reboot_power.thpl
## RUN_ALL_NODES=yes
## VARIANT=run_power_cycle storagev2/ui/cluster_cli_reboot_power.thpl
## ACTION=power
##
## @keywords 10mode,GX,reboot,power cycle,Cmode
##
## @step Initialize Server object.
## @step Set up AUTOBOOT to true, based on RUN_ALL_NODES param it sets on
## all nodes.
## @step Get list of spare,data,failed,broken,reconstruction,copy disks, based
## on RUN_ALL_NODES
## @step Reboot or power cycle based on param ACTION with inhibit storage failover
## @step Get list of spare,data,failed,broken,reconstruction,copy disks
## @step Compare list of disks before and after reboot.
## @step Test fails if any difference
##
## @change 08-30-13 jolie@netapp.com: was duplicating count of broken and present drives. See BURT 743207 for explanation.
## @change 11-28-13 nandeesh@netapp.com: Implemented the testcase in NACL from Server(still retained in some cases), see BURT 743207
## @change 11-28-13 nandeesh@netapp.com: Replaced FILER and FILERB required param by NODES, see the syntax in param section of tharndoc
## @change 10-13-14 nandeesh@netapp.com: Removed Server Objects, added NACL libraries, see burt #856171
## @change 10-13-14 nandeesh@netapp.com: Added a Required param FILER_RPOWER and Optional params FILER_RPOWER_FULL_PATH and
## CLIENT, see burt #856171
## @change 06-22-16 asupriya@netapp.com: Modified the script to rpower more than one node.
## See burt 1013716
## @change 11-08-16 khanh@netapp.com: Added code to wait for filer boot to CLI state, before restart new cycle
## (fix for issue 'Failed to reconnect after powercycle')
## @change 04-20-2018 shailys@netapp.com: Provided fix for the burt1155200
################################################################

use strict;
use warnings;

use NACL::Transit;
use NACL::C::Node;
use NACL::MTask::HA;
use NACL::STask::StorageFailover;
use NACL::STask::Node;
use NACL::C::Client;
use NACL::CS::StorageDisk;
use NACL::CS::StorageFailover;
use Sys::Hostname;
use Data::Dumper;

# Package variables populated by the param() calls below.
use vars (
    '$NODES', '$CYCLE',  '$ACTION',       '$RUN_ALL_NODES',
    '$SLEEP', '$CLIENT', '$FILER_RPOWER', '$FILER_RPOWER_FULL_PATH'
);

my ( $ApiSetObj, $Client_obj, $ApiSetObjClient, $transit_obj );

param( 'NODES', -mesg, 'hostspec for all the NODES are required' );
param( 'CYCLE',                  -default, '10' );
param( 'SLEEP',                  -default, '600' );
param( 'ACTION',                 -default, 'reboot' );
param( 'RUN_ALL_NODES',          -default, 'no' );
param( 'FILER_RPOWER',           -default, undef );
param( 'FILER_RPOWER_FULL_PATH', -default, '/usr/software/utils/bin/rpower' );
param( 'CLIENT',                 -default, '0' );

# Reconnect policy used everywhere a session is re-established after a reboot/power cycle.
my %RECONNECT_OPTS = ( max_reconnect => 5, max_reconnect_timewait => 240 );

# The disk inventory is compared at most this many times per cycle before a
# mismatch is treated as a failure.
my $MAX_DISK_CHECK_ATTEMPTS = 6;

################################################################
# Private helpers. They take everything they need as arguments.
################################################################

# Fetch the disks visible through $node_obj, bucket their names by container
# type, log each bucket and the bucket sizes, and return a hash reference with
# the keys spare, data, broken and removed (each an array reference).
sub _disks_by_container_type {
    my ($node_obj) = @_;

    my %disks_of = ( spare => [], data => [], broken => [], removed => [] );
    my @all_disks = NACL::CS::StorageDisk->fetch(
        command_interface  => $node_obj,
        'requested_fields' => [ 'container-type', 'disk' ],
    );

    foreach my $disk (@all_disks) {
        my $type = $disk->container_type();
        if ( $type =~ /spare/i ) {
            push( @{ $disks_of{spare} }, $disk->{"disk"} );
        }
        elsif ( $type =~ /aggregate/i ) {
            push( @{ $disks_of{data} }, $disk->{"disk"} );
        }
        elsif ( $type =~ /broken/i ) {
            push( @{ $disks_of{broken} }, $disk->{"disk"} );
        }
        elsif ( $type =~ /unassigned/i ) {
            push( @{ $disks_of{removed} }, $disk->{"disk"} );
        }
    }

    logcomment("List of broken drives : @{ $disks_of{broken} }");
    logcomment("List of removed drives : @{ $disks_of{removed} }");
    logcomment("List of spare drives : @{ $disks_of{spare} }");
    logcomment("List of data drives : @{ $disks_of{data} }");

    my $spare_count   = scalar @{ $disks_of{spare} };
    my $data_count    = scalar @{ $disks_of{data} };
    my $broken_count  = scalar @{ $disks_of{broken} };
    my $removed_count = scalar @{ $disks_of{removed} };
    logcomment("spare drives are : $spare_count and data drives are : $data_count and broken : $broken_count and removed : $removed_count");

    return \%disks_of;
}

# Compare two inventories produced by _disks_by_container_type() and return a
# description of every bucket whose size differs ('' when they all agree).
sub _count_mismatches {
    my ( $before, $after ) = @_;

    my $failures = '';
    foreach my $kind (qw(spare data broken removed)) {
        my $count_before = scalar @{ $before->{$kind} };
        my $count_after  = scalar @{ $after->{$kind} };
        next if ( $count_after == $count_before );
        $failures .= "Mismatch in $kind drives before reboot: $count_before\n" . "and after reboot: $count_after\n";
    }
    return $failures;
}

# Log the state reported by the Transit object and, when it is not 'CLI',
# ask Transit to bring the filer to 'CLI'.
sub _ensure_cli_state {
    my ($transit) = @_;

    my $curr_state = $transit->get_state();
    logcomment( "Filer state: " . $curr_state . ", required state is 'CLI'" );
    if ( $curr_state ne 'CLI' ) {
        $curr_state = $transit->change_state( to => "CLI", timeout => 900 );
        logcomment( "Filer state: " . $curr_state );
    }
    return $curr_state;
}

# Build a fresh node object for $node_name, refresh its command interface and
# return ( node object, node name, console apiset ).
sub _console_session {
    my ($node_name) = @_;

    my $node = NACL::C::Node->new( name => $node_name );
    $node->refresh_command_interface();
    my $filer   = $node->name();
    my $console = $node->get_7m_or_nodescope_apiset( connid => 'console' );
    return ( $node, $filer, $console );
}

# Run $command on $apiset and run it once more when nothing came back.
# Always returns a defined string.
sub _execute_with_one_retry {
    my ( $apiset, $command ) = @_;

    my $output = $apiset->execute_raw_command( 'command' => $command );
    if ( !defined $output || $output eq '' ) {
        $output = $apiset->execute_raw_command( 'command' => $command );
    }
    return defined $output ? $output : '';
}

# Re-enable storage failover through $node_obj.
sub _enable_sfo {
    my ($node_obj) = @_;

    NACL::STask::StorageFailover->enable(
        command_interface => $node_obj,
        node              => $node_obj,
    );
    return;
}

################################################################
# Parameter validation and node selection
################################################################

my @nodes = split( /,|\s+/, $NODES );

# Every ACTION other than 'reboot' takes the power-cycle path below, which
# cannot work without FILER_RPOWER, so check for all of them rather than only
# for the literal value 'power'.
if ( ( $ACTION ne 'reboot' ) && ( !defined $FILER_RPOWER ) ) {
    logresult( 'FATAL', "Please pass Required param FILER_RPOWER if ACTION=power is passed" );
}

# Get random node. rand(N) yields [0, N), so N must be the element count;
# using the last index here would never select the final node.
my $random_server = $nodes[ int rand( scalar @nodes ) ];

my @runnable_nodes = ( $RUN_ALL_NODES =~ /yes/ ) ? @nodes : ($random_server);

if ($CLIENT) {
    $Client_obj = NACL::C::Client->find();
}
else {
    # Default to the short name of the host the test is running on.
    my ($short_host) = split( /\./, hostname() );
    $CLIENT     = $short_host;
    $Client_obj = NACL::C::Client->new( client => $CLIENT );
}

# Create Client CLI object to access Client API's.
try {
    logcomment("Create client CLI object");
    my $conn = connect("$CLIENT#ssh");
    $ApiSetObjClient = NACL::APISet->new(
        category   => "Host",
        interface  => "CLI",
        connection => $conn,
        set        => "Linux"
    );
}
catch NACL::APISet::Exceptions::TimeoutException with {
    my $exception_object = shift;
    logcomment( "Caught a " . ref($exception_object) . "!" );
    logcomment( "Error is: " . $exception_object->text() );
    logresult( 'FATAL', "failure found is " . Dumper( $exception_object->text() ) );
};

################################################################
# Make sure every node under test boots on its own
################################################################

foreach my $n (@runnable_nodes) {
    logcomment("Setting AUTOBOOT on node $n");
    my $node_obj  = NACL::C::Node->new( name => $n );
    my $node_name = $node_obj->name();
    $ApiSetObj = $node_obj->get_7m_or_nodescope_apiset();
    $ApiSetObj->bootargs_set( arg => "AUTOBOOT", value => "true" );
    logcomment("AUTOBOOT set to True - $node_name");
}

################################################################
# Reboot / power-cycle loop
################################################################

foreach my $n (@runnable_nodes) {
    logcomment("The node selected for this test is $n");
    my $node_obj = NACL::C::Node->new( name => $n );
    $ApiSetObj = $node_obj->get_7m_or_nodescope_apiset();

    # Get list of disks [SPARE,BROKEN,FAILED,PRESENT] before any cycle.
    my $disks_before = _disks_by_container_type($node_obj);

    # Disable SFO, remembering whether it has to be restored afterwards.
    logcomment('Disabling SFO');
    my $sfo_was_enabled;
    logcomment('Verifying the cluster configuration');

    # NOTE(review): the original passed filter => {'enabled'}, an odd-element
    # hash that Perl turns into { enabled => undef } with a warning. The value
    # is spelled out to silence the warning without changing the call;
    # requested_fields => ['enabled'] may have been intended - confirm.
    my $node_state = NACL::CS::StorageFailover->fetch(
        command_interface => $node_obj,
        filter            => { 'enabled' => undef }
    );
    if ( $node_state->enabled() eq "true" ) {
        $sfo_was_enabled = 1;
        NACL::STask::StorageFailover->disable(
            command_interface => $node_obj,
            node              => $node_obj,
        );
    }
    else {
        logcomment("The Storage failover is already disabled");
    }

    $transit_obj = NACL::Transit->new( name => $n, 'timeout' => 1800 );

    # CYCLE is the number of cycles to run, so count 1 .. CYCLE
    # (0 .. CYCLE ran one cycle too many).
    for my $i ( 1 .. $CYCLE ) {
        $node_obj->refresh_command_interface(%RECONNECT_OPTS);

        # Trying to reconnect if the session has expired
        if ( !$ApiSetObj->api_is_session_alive() ) {
            logcomment("session got expired,trying to reconnect");
            $ApiSetObj->api_reconnect_session(%RECONNECT_OPTS);
        }
        logcomment("$i Cycle started");

        if ( $ACTION eq 'reboot' ) {
            logcomment("Reboot the node $n");
            $transit_obj->reboot( 'timeout' => 1800, 'restore_system_config' => 0 );

            # Bring node up before reconnect
            _ensure_cli_state($transit_obj);

            # Trying to reconnect if the session has expired
            $node_obj->refresh_command_interface(%RECONNECT_OPTS);
        }
        else {
            # Set environment variable AUTOBOOT to true
            $ApiSetObj->bootargs_set( arg => "AUTOBOOT", value => "true" );

            # Power every listed rpower device off, then every device on,
            # pausing 20 seconds after each rpower call.
            my @rpower_devices = split( /,|\s+/, $FILER_RPOWER );
            foreach my $power_state ( 'off', 'on' ) {
                foreach my $device (@rpower_devices) {
                    try {
                        $ApiSetObjClient->rpower(
                            'devicename'         => $device,
                            tool_full_path       => $FILER_RPOWER_FULL_PATH,
                            $power_state         => 1,
                            'connectrec-timeout' => 1200
                        );
                    }
                    catch NACL::APISet::Exceptions::ResponseException with {
                        my $exception_object = shift;
                        logcomment( "Caught a " . ref($exception_object) . "!" );
                        logcomment( "Error is: " . $exception_object->text() );
                        logresult( 'FATAL', "failure found is " . Dumper( $exception_object->text() ) );
                    };
                    sleep 20;
                }
            }

            logcomment("Waiting $SLEEP sec to finish power cycle, filer up state");
            sleep($SLEEP);

            # Wait until Filer boot to CLI state, bring node up before reconnect
            _ensure_cli_state($transit_obj);
        }

        # Trying to reconnect if the session has expired
        $node_obj->refresh_command_interface();
        $ApiSetObj->api_reconnect_session(%RECONNECT_OPTS);
        logcomment('Done with connections attempt');

        # Compare the disk inventory with the pre-test one. The inventory is
        # re-read (after a 10 second pause) while the counts still differ.
        logcomment('Get list of disks and offline volumes after reboot');
        my $failures = '';
        for my $attempt ( 1 .. $MAX_DISK_CHECK_ATTEMPTS ) {
            my $disks_after = _disks_by_container_type($node_obj);
            logcomment('Check storage consistency after the reboot');
            $failures = _count_mismatches( $disks_before, $disks_after );
            sleep 10;
            last if ( $failures eq '' );
        }
        if ( $failures ne '' ) {
            _enable_sfo($node_obj) if ($sfo_was_enabled);
            logresult( 'FATAL', "$failures" );
        }

        # ---- Disk path check -------------------------------------------
        logcomment("Check disk path name on both nodes - Nodes entered by user : @nodes");
        my $status = 0;

        # NOTE(review): this uses the first two entries of NODES as the pair
        # whose names must both appear in each disk path; $nodes[1] is
        # undefined when only one node was passed - confirm callers always
        # pass at least two.
        my $Node_Object_A = NACL::C::Node->new( name => $nodes[0] );
        my $FILER_A       = $Node_Object_A->name();
        my $Node_Object_B = NACL::C::Node->new( name => $nodes[1] );
        my $FILER_B       = $Node_Object_B->name();
        logcomment("************************************************************");
        logcomment("** FILER A : $FILER_A | FILER B : $FILER_B ***");
        logcomment("************************************************************");
        logcomment("Check disk path on node $n");

        my ( $path_node, $path_filer, $path_console ) = _console_session($n);
        my @failed_drv_p;
        $path_console->execute_raw_command( 'command' => "\013" );
        $path_node->refresh_command_interface(%RECONNECT_OPTS);

        # The output variable is assigned exactly once here. The previous
        # "my $out = ... if ( $out eq '' )" re-declaration left the variable
        # undefined whenever the first attempt succeeded, so the loop below
        # never saw any line.
        my $path_out = _execute_with_one_retry( $path_console, "set -rows 0;storage disk show -fields diskpathnames" );
        foreach my $line ( split /\n/, $path_out ) {
            next if ( $line =~ /disk|^-|displayed|debug|entries|^$|session\.change|\[\S+\]/ );

            # Node names are matched literally.
            if ( ( $line =~ /\Q$FILER_A\E/ ) && ( $line =~ /\Q$FILER_B\E/ ) ) {
                my ( $drv, $path ) = $line =~ /(\S+)\s+(\S+)/;
                logcomment("Drive $drv has path $path");
            }
            else {
                my ( $drv, $path ) = $line =~ /\S+\:(\S+)\s+(\S+)/;

                # Fall back to the raw line when it does not have the expected shape.
                $drv = $line if ( !defined $drv );
                logcomment("Drive $drv path is incomplete");
                logcomment("**FATAL** : Drive $drv path is INCOMPLETE");
                push( @failed_drv_p, $drv );
                $status = 1;
            }
        }
        if ( ( $status == 1 ) && (@failed_drv_p) ) {
            logcomment("**FATAL** : : @failed_drv_p drives path is INCOMPLETE");
            logresult( 'FATAL', "@failed_drv_p drives path is INCOMPLETE" );
        }
        logcomment("Completed checking drive path on both nodes $path_filer");

        logcomment("Wait for few minutes, before checking failed drives");
        sleep(150);

        # ---- Failed drives in aggr status ------------------------------
        logcomment("Check for prefail or broken drives in aggr status");
        my ( $aggr_node, $aggr_filer, $aggr_console ) = _console_session($n);
        $aggr_console->execute_raw_command( 'command' => " " );
        my $aggr_out = _execute_with_one_retry( $aggr_console, "set -rows 0;run local aggr status -r" );
        if ( $aggr_out =~ /prefail|Broken\s+disks|failed/ ) {
            logcomment("$aggr_filer : FAILED DRIVE FOUND ");
            logcomment("$aggr_out");
            $status = 1;
        }
        if ( $status == 1 ) {
            logcomment("**FATAL** : FAILED Drive Found");
            logresult( 'FATAL', "FAILED Drive Found" );
        }

        # ---- Failed drives in sysconfig --------------------------------
        logcomment("Check for failed drives in sysconfig ");
        my ( $sys_node, $sys_filer, $sys_console ) = _console_session($n);
        logcomment("$sys_filer : Checking for failed drive");
        $sys_console->execute_raw_command( 'command' => " " );
        my $sys_out = _execute_with_one_retry( $sys_console, "set -rows 0;run local sysconfig -v" );
        if ( $sys_out =~ /fail|failed|prefail/ ) {
            logcomment("$sys_filer : FAILED DRIVE FOUND ");
            logcomment("SYSCONFIG : $sys_out");
            $status = 1;
        }
        if ( $status == 1 ) {
            logcomment("**FATAL** : FAILED Drive Found");
            logresult( 'FATAL', "FAILED Drive Found" );
        }

        # ---- Link status ----------------------------------------------
        logcomment("Check for Link Status ");
        my ( $link_node, $link_filer, $link_console ) = _console_session($n);
        $link_console->execute_raw_command( 'command' => " " );
        my $link_out = $link_console->execute_raw_command( 'command' => "set -rows 0;run local storage show psm" );
        $link_out = '' if ( !defined $link_out );

        my @link_dis_drivs;
        foreach my $line ( split /\n/, $link_out ) {
            next if ( $line =~ /disk|^-|displayed|debug|entries|^$|session\.change/ );

            # Only bracketed drive rows that mention a DIS/LNK port state are reported.
            next if ( $line !~ /\[.*]/ );
            next if ( $line !~ /DIS\/LNK|LNK|DIS/ );

            my ( $drv, $state, $lnk_lan, $max_lnk, $neg_lnk, $med_mx_lnk, $dev_mx_la, $Neg_max, $lan_wd ) = ( $line =~ /(\[.*\])\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/ );
            logcomment("Drive ID : $drv State is $state !!!!!!!!! ");
            logcomment("Drive port link states are : Disk ID : $drv\n Port Max Link Speed:$lnk_lan\n Negotiated Max Link Speed:$max_lnk\n Medium Link Speed:$neg_lnk\n Device Max Link Speed:$med_mx_lnk\n Port Max Lane Width:$dev_mx_la\n Negotiated Max Lanewidth:$Neg_max\n Lanewidth:$lan_wd\n");
            logcomment("Drive ID : $drv State is $state");
            push( @link_dis_drivs, $drv );
        }
        if (@link_dis_drivs) {
            logcomment("***FATAL*** : $link_filer : Port State for following drives are DISABLED or OFFLINE, Drive ID(s) are: @link_dis_drivs");
            logresult( "FATAL", "Port State for following drives are DISABLED or OFFLINE, Drive ID(s) are: @link_dis_drivs" );
        }
        else {
            logcomment("All Links appears to be active");
        }

        logcomment("$i Cycle passed");
    }

    logcomment("Test passed on node $n");

    # Put storage failover back the way it was found.
    _enable_sfo($node_obj) if ($sfo_was_enabled);
}

logresult( 'PASS', 'Test successfully passed' );