# Copyright (c) 2001-2013 NetApp, Inc., All Rights Reserved
# Any use, modification, or distribution is prohibited
# without prior written consent from NetApp, Inc.
#

## @summary Non Disruptive Outage Module
## @author dl-nacl-dev@netapp.com
## @status shared
## @pod here

=head1 NAME

NACL::MTask::NDOChecker - Task to verify client data access during non
disruptive operations happening on a filer.

=head1 SYNOPSIS

    use NACL::MTask::NDOChecker;

    # Start the ndo checker
    my $ndocheck_obj = NACL::MTask::NDOChecker->start(
        client    => $ci,
        volume    => $volume,
        threshold => $threshold,
        %other_opts
    );

    # Run the testcases which needs to be tested for client access
    # during non-disruptive operation
    ...

    # Stop NDO checker
    $ndocheck_obj->stop();

=head1 DESCRIPTION

C<NACL::MTask::NDOChecker> provides methods related to client data access,
while a non-disruptive operation is happening on the filer.
Examples of NDO: SFO, VolumeMove etc.

=head1 ATTRIBUTES

=head2 client

(Required, isa 'NACL::C::Client') A command interface object which represents
the client from which the library will access the filer's volume to perform
I/O. See L<NACL::C::Client>.

=head2 volume

(Required, isa 'NACL::C::Volume') A volume object which will be used for I/O
from the client. The volume is normally part of the non-disruptive operation
as well. Example: a volume being moved, or a volume belonging to a node being
taken over. See L<NACL::C::Volume>.

=head2 threshold

(Required) This indicates the duration in seconds during which the volume
could be inaccessible from the client, while data access is in progress.

=head2 mount_obj

(Optional, isa 'NACL::C::Client::Mount') A mount object for the volume, if
the volume is already mounted in the client.

=head2 file_pattern (Optional) Substring of file which will be created for I/o in the client by NDOChecker =cut package NACL::MTask::NDOChecker; use strict; use warnings; use base qw(NACL::MTask::MTask); use Tharn qw(sleep); use NATE::Log qw(log_global); my $Log = log_global(); my $may_enter = $Log->may_enter(); my $may_exit = $Log->may_exit(); use NATE::Log qw(log_global); use Params::Validate qw(validate_with CODEREF OBJECT SCALAR ARRAYREF BOOLEAN); use NATE::BaseException qw(:try); use NATE::Exceptions::Argument qw(:try); use NACL::APISet::Exceptions::InvalidParamException qw(:try); use NACL::ComponentUtils qw(Dumper); use NACL::STask::Client::Mount; use NACL::APISet; use NATE::Process; use NACL::GeneralUtils qw(random_name_generator); use NACL::C::VolumeFile; =head1 METHODS =head2 new my $ndocheck_obj = NACL::MTask::NDOChecker->new ( client => $ci, volume => $volume, threshold => $threshold, mount_obj => $vol_mount_obj, file_pattern => $file_pattern ); or my $ndocheck_obj = NACL::MTask::NDOChecker->new ( client => $ci, volume => $volume, threshold => $threshold ); Return a new NACL::MTask::NDOChecker object with the given attributes. 
=cut

use Class::MethodMaker [
    new    => [ '-hash', '-init', 'new' ],
    scalar => [ { '-type' => 'NACL::C::Client' }, 'client' ],
    scalar => [ { '-type' => 'NACL::C::Volume' }, 'volume' ],
    scalar => 'threshold',
    scalar => '_id',
    hash   => '_proc_objects',
    scalar => [ { '-type' => 'NACL::APISet' }, '_client_api' ],
    scalar => [ { '-type' => 'NACL::C::Client::Mount' }, 'mount_obj' ],
    scalar => '_user_mount_obj',
    scalar => [ { -default => 'NACL_NDO_CHECKER' }, 'file_pattern' ]
];

## Initializer invoked by the generated new() (see '-init' above).
##
## Validates the constructor arguments and caches, in _client_api, the
## Host/CLI apiset of the client.
##
## Accepted arguments: client, volume, threshold (mandatory) and
## file_pattern, mount_obj (optional). mount_obj is listed in the spec
## because validate_with() rejects keys that are missing from the spec,
## which made the documented new( ..., mount_obj => $obj ) call throw.
##
## Throws NACL::APISet::Exceptions::InvalidParamException when the client
## type does not match /unix-linux/.
sub init {
    log_global->enter();
    my $self = shift;

    validate_with(
        params => \@_,
        spec   => {
            client       => { type => OBJECT, isa => 'NACL::C::Client' },
            volume       => { type => OBJECT, isa => 'NACL::C::Volume' },
            threshold    => { type => SCALAR },
            file_pattern => { type => SCALAR, optional => 1 },
            mount_obj    => {
                type     => OBJECT,
                isa      => 'NACL::C::Client::Mount',
                optional => 1
            },
        }
    );

    my $ci  = $self->client();
    my $set = $ci->type();

    ## Only Linux clients are handled; anything else is rejected up front.
    unless ( $set =~ /unix-linux/ ) {
        log_global->exit();
        NACL::APISet::Exceptions::InvalidParamException->throw(
            text => "Client "
              . "of type $set are not yet supported in the library " );
    }
    my $api_set = 'Linux';

    my $api = $ci->apiset(
        category  => 'Host',
        set       => $api_set,
        interface => 'CLI'
    );

    ## store the api object in object
    $self->_client_api($api);

    log_global->exit();
}    ## end sub init

=head2 start

Static Call

    my $ndocheck_obj = NACL::MTask::NDOChecker->start(
        client    => client_command_interface,
        volume    => volume_object,
        threshold => $threshold,
        load_type => 'RW'
    );

Static Call (with all other possible options)

    my $ndocheck_obj = NACL::MTask::NDOChecker->start(
        client       => client_command_interface,
        volume       => volume_object,
        threshold    => $threshold,
        load_type    => 'RW',
        file         => volumefile_instance,
        file_pattern => 'NDO_CHECK_PATH',
        mount_obj    => $vol_mount_obj
    );

Instance call

    $ndochecker_obj->start(
        load_type => 'RW',
        file      => volumefile_instance,
        mount_obj => $vol_mount_obj
    );

(Class or Instance method) The method starts the ndochecker on the volume
specified. This is a non-blocking call, as this returns soon after starting
the checker in the client.
The method returns a NACL::MTask::NDOChecker object immediately. Normally the
checker continues to run in the volume till $ndochecker_obj->stop() is invoked
on the object. In case there is an outage in the client and I/O takes longer
than the threshold mentioned, then the I/O tool stops running; however, the
caller gets to know about this failure only while invoking stop() on the
object. An exception containing the details of the failure is thrown while
invoking the stop() method in this case.

=head3 Options

=over

=item C<< client => $ci >>

(Required for class method; must not be passed on an instance call, where the
object's own attribute is used)
The component object representing the client on which the checker needs to be
started.

=item C<< volume => $volume >>

(Required for class method; must not be passed on an instance call, where the
object's own attribute is used)
Volume Component/STask object which represents the volume on which the ndo
checker is run.

=item C<< threshold => $threshold >>

(Required for class method; must not be passed on an instance call, where the
object's own attribute is used)
This indicates the expected duration in seconds for which the filer could be
down/inaccessible, while a non disruptive operation is in progress.

=item C<< file => $file >>

(Optional) NACL::C::VolumeFile instance for the file used for I/O, in the
start() method.

=item C<< mount_obj => $mount_obj >>

(Optional) If the caller has already mounted the volume in the client, then a
mount_obj of type NACL::C::Client::Mount could be passed. Instead of mounting
the volume, the object will be re-used for doing I/O. If the object is not
passed, then a mount is performed and will be unmounted during invocation of
the stop() method.

=item C<< file_pattern => $file_pattern >>

(Optional) Substring/pattern used in naming, while creating the directory path
for performing I/O in the volume. This is useful to make sure the files are
easy to recognize from another process that may potentially do something
disruptive to the file. Examples of this include a background unlink/delete
load or something that might concurrently modify it.
Default is 'NACL_NDO_CHECKER'.

=item C<< load_type => $load >>

(Optional) Indicates the load type - RW or RO. Default is 'RW'. Any other
value raises NATE::Exceptions::Argument before the volume is mounted.

=back

=cut

sub start {
    log_global->enter();
    my $pkg_or_obj = shift;

    log_global->debug( "Opts to 'start':\n" . Dumper( \@_ ) )
      if ( log_global->may_debug() );

    my %opts = validate_with(
        params => \@_,
        spec   => {
            client =>
              { type => OBJECT, isa => 'NACL::C::Client', optional => 1 },
            volume =>
              { type => OBJECT, isa => 'NACL::C::Volume', optional => 1 },
            threshold => { type => SCALAR, optional => 1 },
            load_type => { type => SCALAR, optional => 1 },
            file      => {
                type     => OBJECT,
                isa      => 'NACL::C::VolumeFile',
                optional => 1
            },
            file_pattern => { type => SCALAR, optional => 1, },
            mount_obj    => {
                type     => OBJECT,
                isa      => 'NACL::C::Client::Mount',
                optional => 1
            }
        }
    );

    ## Class call: checks the mandatory options are present.
    ## Instance call: pulls client/volume/threshold/... from the object.
    %opts = $pkg_or_obj->_process_inputs(%opts);

    my $client_command_interface = delete $opts{client};
    my $volume_instance          = delete $opts{volume};
    my $threshold_in_seconds     = delete $opts{threshold};
    my $load_type                = delete $opts{load_type};
    my $file_pattern             = delete $opts{file_pattern};

    ## Plain assignments on purpose: "my $x = ... if COND" has undefined
    ## behaviour (perlsyn). delete() on a missing key simply yields undef.
    my $file               = delete $opts{file};
    my $input_mount_object = delete $opts{mount_obj};

    $file_pattern = 'NACL_NDO_CHECKER' unless defined $file_pattern;
    $load_type    = 'RW'               unless defined $load_type;

    ## Reject unsupported load types before anything is mounted or created;
    ## _get_run_opts() only knows how to build options for RW and RO.
    unless ( $load_type eq 'RW' || $load_type eq 'RO' ) {
        log_global->exit();
        NATE::Exceptions::Argument->throw(
                "Invalid load_type '$load_type' passed to "
              . "NACL::MTask::NDOChecker::start; expected 'RW' or 'RO'" );
    }

    my $hostrec  = $client_command_interface->hostrec();
    my $hostspec = $hostrec->id();

    my $self;
    if ( ref $pkg_or_obj ) {

        # Instance method call. Need not create a new instance
        $self = $pkg_or_obj;
    }
    else {

        # Class method call, so create an instance
        $self = NACL::MTask::NDOChecker->new(
            client       => $client_command_interface,
            volume       => $volume_instance,
            threshold    => $threshold_in_seconds,
            file_pattern => $file_pattern
        );
    }

    my $id = random_name_generator( prefix => "id", size => 1 );
    $self->_id($id);

    my $vol_name = $volume_instance->volume();

    ## Mount the volume using the mount stask, unless the caller handed in
    ## an existing mount. _user_mount_obj records who owns the mount so
    ## that stop() only unmounts what this package mounted itself.
    my $mount_obj = $input_mount_object;
    if ( defined $mount_obj ) {
        $self->_user_mount_obj(1);
    }
    else {
        $mount_obj = $self->_perform_vol_mount();
        $self->_user_mount_obj(0);
    }

    ## Set the mount_obj attribute of the object
    $self->mount_obj($mount_obj);

    my $api       = $self->_client_api();
    my $mount_dir = $mount_obj->mount_point();

    my ( $io_path, $file_name );
    if ( defined $file ) {
        ## Start from the mount point to reach the file having been passed.
        $io_path = $mount_dir;

        ## Path of the containing directory, from the volumefile instance
        my $dir_path = $file->get_containing_directory_path();

        ## Keep only the part of the directory path that follows
        ## "<volume name>/".
        ## NOTE(review): index() finds the FIRST occurrence of the volume
        ## name and returns -1 when it is absent; both cases produce a
        ## wrong offset. Confirm the path always contains the name once.
        my $start = index( $dir_path, $vol_name ) + length($vol_name) + 1;

        ## When the file sits directly under the volume, $start lies past
        ## the end of $dir_path; guard so substr() does not warn.
        my $file_path =
          ( $start < length $dir_path ) ? substr( $dir_path, $start ) : '';

        if ( length $file_path ) {
            $io_path .= '/' . $file_path;
        }

        ## Last path component of the VolumeFile is the file name
        my @elems = split '/', $file->path();
        $file_name = $elems[-1];

        log_global->debug("This is io_path $io_path ");
        log_global->debug("This is filename $file_name ");

        if ( $load_type eq 'RW' ) {
            $api->chmod(
                mode              => '777',
                paths             => $io_path . '/' . $file_name,
                'privilege-level' => 'root',
            );
        }
    }
    else {
        ## No file given: create a private directory under the mount point
        $io_path = $mount_dir . $self->_get_io_path() . "/_" . $id;
        $self->_mkdir_for_io($io_path);
    }

    ## Get runtime opts
    my $run_opts = $self->_get_run_opts( $load_type, $io_path, $file_name );

    ## Code string that would be used to launch dt. dt runs in the
    ## background until stop is invoked on the object or an
    ## I/O error happens. When any I/O error happens then dt throws
    ## NATE::BaseException and exits. The exception is saved and
    ## is thrown to the caller when stop() is invoked on the object.
    my $codestring = '{
        use NATE::Log qw(log_global);
        use NATE::ParamSet;
        use NATE::BaseException qw(:try);
        use Data::Dumper qw(Dumper) ;
        my ($opts,$io_path) = @_;
        my $client_dt_proc = NATE::Process->new(
            codespec => "/usr/software/test/bin/dt.stable",
            args => [$opts],
            runid => "dt_run",
            workdir => $io_path,
            onexit => sub {
                my ($proc) = @_;
                log_global->debug (sub { "Executing exit handler P3" } );
                # Pass the childs results to the parent.
                my $worst_result = $proc->exit_status();
                log_global->comment( "This is the Exit Status $worst_result" );
                if ( $worst_result != 0 ) {
                    NATE::BaseException->throw( "dt tool returned non zero exit code.Please verify logs \n");
                } ## if #worst ends here
            } ## sub ends here
        );
        while (1) {
            $client_dt_proc->start();
            $client_dt_proc->wait();
        } ##while ends
        $client_dt_proc->stop();
        $client_dt_proc->destroy();
    }';    ##code string ends

    my $runid  = $vol_name . "_" . $id;
    my $logdir = NATE::ParamSet->param_global->get('TOP_LOGDIR') . "/"
      . NATE::ParamSet->param_global->get('RUNID');
    my $logfile_p = $logdir . "/" . $runid;
    log_global->log("logfile path $logfile_p");

    my $run_dt_proc = NATE::Process->new(
        codespec => $codestring,
        runid    => $runid,
        client   => $hostspec,
        args     => [ $run_opts, $io_path ],
        onexit   => sub {
            my ($proc) = @_;
            log_global->debug( sub {"Executing exit handler P2"} );

            # Pass the child's results to the parent.
            my $worst_result = $proc->worst_result();
            if ($worst_result) {
                $worst_result->log();
            }
        }
    );

    ## The object data _proc_objects is a hash which holds the handle
    ## of the process launched for each volume
    my %proc_obj_details =
      ( $vol_name => [ $run_dt_proc, $logfile_p, $runid ] );
    $self->_proc_objects(%proc_obj_details);

    #Run Io Tool
    $self->_run_io_tool();

    log_global->exit();
    return $self;
}    ## end sub start

=head2 stop

    $ndocheck_obj->stop();

(Instance Method) Method to stop the checker started using the start() method.
If the volume was mounted via the package then it would be unmounted.
If the volume remained inaccessible for more than threshold duration then an
exception will be reported containing the details of the failure in this
method.

=cut

sub stop {
    log_global->enter();
    my $self   = shift;
    my $volume = $self->volume->volume;

    my %all_procs   = $self->_proc_objects();
    my $run_dt_proc = $all_procs{$volume}->[0];

    ## Invoke a stop on the process used to launch dt
    ## This will in turn stop the dt process if it is still running
    $run_dt_proc->stop();

    ## wait for 2 secs for the procs to end
    Tharn::sleep(2);

    ## Unmount only when this package performed the mount itself
    unless ( $self->_user_mount_obj() ) {
        my $mount = $self->mount_obj();

        ## Same budget as the earlier goto-based retry: one initial attempt
        ## plus three retries. A failure to unmount is reported as a
        ## warning and does not abort stop().
        my $max_attempts = 4;
        my $unmounted    = 0;
        my $last_error;
        foreach my $attempt ( 1 .. $max_attempts ) {
            try {
                $mount->purge();
                $unmounted = 1;
            }
            catch NATE::BaseException with {
                $last_error = shift;
            };
            last if $unmounted;
        }

        unless ($unmounted) {
            log_global->warn("Failed to unmount the volume $volume ");
            log_global->comment(
                "The error message is " . $last_error->text() );
        }
    }

    my $logfile_p = $all_procs{$volume}->[1];

    ## Capture the worst result from the launched process
    my $worst_result = $run_dt_proc->worst_result();

    ## destroying the process here. If there are errors
    ## exception will be thrown in the next block.
    $run_dt_proc->destroy();

    ## The worst result is only logged here; the decision to throw is
    ## made by _guess_failures() from the dt log file.
    if ($worst_result) {
        log_global->debug(
            sub { "This is the worst result " . Dumper($worst_result) } );
        my $type = $worst_result->type();

        ## Grouped alternation: without (?:...) the anchors bind only to
        ## the first and last alternatives.
        if ( $type =~ /^(?:FAIL|FATAL|SCRIPT)$/ ) {
            log_global->comment(" dt has reported I/O error. ");
        }
    }    ## if $worst ends here

    ## Check II
    $self->_guess_failures($logfile_p);

    log_global->exit();
}    ## end sub stop

###################################################################################################
#   HELPER FUNCTIONS
###################################################################################################

## Scans "<logfile_p>/dt_run.log" line by line for known dt messages.
## If any of them is found, throws NATE::BaseException with a message that
## says whether the evidence points at a filer outage; returns normally
## when no failure indicator is present.
sub _guess_failures {
    log_global->enter();
    my ( $self, $logfile_p ) = @_;

    my $dt_log = $logfile_p . '/dt_run.log';
    my $dt_out = NATE::Inc::get_file($dt_log);

    ## Guard against an undefined log content so split() does not warn
    my @log_lines = defined $dt_out ? split( "\n", $dt_out ) : ();

    my $failure_count = 0;
    my %clue_list     = ();
    my $duration      = undef;
    my $noprogress    = undef;

    foreach my $line (@log_lines) {
        ## Non zero values are set in %clue_list every time a strong clue
        ## exists in the log file that the I/O delay happened due to a
        ## possible filer outage
        if ( $line =~ /No progress made for .* for \d+ seconds!/ ) {
            ## This indicates I/O delay - read/write/noprogress
            ## Not a sure indicator of filer outage.
            ## Just increment the failure count
            $failure_count++;
        }
        elsif ( $line
            =~ /This requests\' elapsed time of (\d+), has exceeded the noprogtt of \d+ seconds!/
          )
        {
            ## Strong Indicator of filer outage : ONE
            ## Trigger should have been executed at 'elapsed time' beyond
            ## the threshold duration; remember that elapsed time.
            $duration = $1;
            log_global->comment("This is duration $duration in loop");
            $clue_list{'NOPROG_DURATION_ISSET'} = 1;
            $failure_count++;
        }
        elsif ( $line =~ /Executing\:.*?dt_noprog_script\.ksh.*?noprog.*/ ) {
            ## Strong Indicator of filer outage : TWO
            ## The no progress trigger script has been executed
            $clue_list{'TRIGGER_EXECUTED'} = 1;
            $failure_count++;
        }
        elsif ( $line =~ /The no progress time is (\d+)/ ) {
            if ( $1 > 0 ) {
                ## Strong Indicator of filer outage : THREE
                ## The noprogress time has been reported by the trigger
                $clue_list{'NOPROG_TIME_ISREPORTED'} = 1;
                $failure_count++;
            }
            else {
                ## Reported no progress time is not greater than zero
                $noprogress = 0;
            }
        }
        elsif ( $line =~ /I\/O has exceeded the limit \(noprogt\)/ ) {
            ## Strong Indicator of filer outage : FOUR
            ## This line suggests a failure due to noprogress in I/O
            $clue_list{'IO_LIMIT_EXCEEDED'} = 1;
            $failure_count++;
        }
        elsif ( $line =~ /Total errors detected\: (\d+)\/\d+/ ) {
            ## Presence of this line suggests error in I/O.
            ## Not a definitive indicator of noprogress due to a
            ## disruptive operation; count it only for non zero errors.
            my $errors_detected = $1;
            if ( $errors_detected > 0 ) {
                $failure_count++;
            }
        }
        elsif ( $line =~ /Terminating with status code -1/ ) {
            ## I/O error; not a definitive indicator of an outage
            $failure_count++;
        }
        elsif ( $line =~ /Exiting with status code -1/ ) {
            ## I/O error; not a definitive indicator of an outage
            $failure_count++;
        }
        elsif ( $line =~ /\* SCRIPT ERROR\: Subtest exited with status 255/ )
        {
            ## I/O error; not a definitive indicator of an outage
            $failure_count++;
        }
    }    ##foreach ends here

    log_global->comment("Failure count $failure_count");

    if ( $failure_count > 0 ) {
        ## An outage is considered certain only when all four strong
        ## clues were seen in the log.
        my @strong_clues = qw(NOPROG_DURATION_ISSET TRIGGER_EXECUTED
          NOPROG_TIME_ISREPORTED IO_LIMIT_EXCEEDED);
        my $all_clues_seen = !grep { !$clue_list{$_} } @strong_clues;

        my $error_msg;
        my $log_det
            = " Verify following logfiles:\n $logfile_p\.log and \n $logfile_p"
            . "\/dt_run.log \n for details and confirm \n";

        if ($all_clues_seen) {
            ## This is definitely due to some disruptive operation:
            ## no progress in dt io and hence a script result was reported
            $error_msg = "Filer access failure has occured .\n";
            $error_msg .= "The errors suggest filer outage\n";
        }
        elsif ( defined $duration ) {
            ## noprogress was hit when dt was running; most probably
            ## because of the disruptive operation that happened
            $error_msg = "No progress in I/O for $duration seconds \n";
            $error_msg .= "The errors suggest filer outage\n";
        }
        else {
            ## Some read/write error made dt exit with a non zero code.
            $error_msg = "I/O errors occured when checker was running. \n";

            ## $noprogress is either undef (never reported) or 0; both are
            ## treated alike, without the uninitialized-value warning that
            ## a numeric comparison against undef would emit.
            if ( !defined $duration && !$noprogress ) {
                $error_msg .= "The errors donot suggest filer outage \n";
            }
        }
        log_global->exit();
        NATE::BaseException->throw( $error_msg . $log_det );
    }

    ## Balance the enter() above on the no-failure path as well
    log_global->exit();
}    ##sub _guess_failure ends

## Starts the launcher process stored in _proc_objects for this object's
## volume.
sub _run_io_tool {
    log_global->enter();
    my $self = shift;

    my $volume_name = $self->volume->volume();
    my %all_procs   = $self->_proc_objects();
    my $launch_proc = $all_procs{$volume_name}->[0];

    ## start the process now. It will run in background
    $launch_proc->start();

    log_global->exit();
}    ## end sub _run_io_tool

## Builds the option string handed to dt.
##   $load_type - 'RW' (file is dt's output, "of=") or
##                'RO' (file is dt's input, "if=", with compare disabled)
##   $io_path   - directory holding the I/O file
##   $file      - file name inside $io_path; defaults to 'ndo_file'
## Returns the option string, or undef for any other load type.
sub _get_run_opts {
    log_global->enter();
    my ( $self, $load_type, $io_path, $file ) = @_;
    my $threshold_in_seconds = $self->threshold;

    ## noprogt is passed 10 seconds below the user threshold (floor of 1)
    ## while noprogtt carries the threshold itself.
    my $noprogt =
      ( $threshold_in_seconds > 10 ) ? $threshold_in_seconds - 10 : 1;

    my $trigger = 'trigger=cmd:/x/eng/localtest/noarch/bin/dt_noprog_script.ksh';

    my @dt_opts;
    if ( $load_type eq 'RW' ) {
        $file = 'ndo_file' unless defined $file;
        @dt_opts = (
            'enable=noprog', 'alarm=2s',
            'flags=direct',  "noprogt=$noprogt",
            "noprogtt=$threshold_in_seconds", 'limit=5m',
            'enable=debug',  $trigger,
            "of=$io_path/$file",
        );
    }
    elsif ( $load_type eq 'RO' ) {
        $file = 'ndo_file' unless defined $file;
        @dt_opts = (
            'enable=noprog',  'disable=compare',
            'alarm=2s',       'flags=direct',
            "noprogt=$noprogt", "noprogtt=$threshold_in_seconds",
            'limit=5m',       $trigger,
            "if=$io_path/$file",
        );
    }

    ## The leading empty element reproduces the leading blank of the
    ## option string (" enable=noprog ...").
    my $run_opts = @dt_opts ? join( ' ', '', @dt_opts ) : undef;

    log_global->exit();
    return $run_opts;
}    ## end sub _get_run_opts

## The method just returns the path which could be used for io
## Later this path is created under the junction(mount point)
## and IO is done.
## The path is "/<file_pattern>/<pid of this process>/<client name>".
sub _get_io_path {
    log_global->enter();
    my $self = shift;

    my $file_pattern = $self->file_pattern();
    my $name         = $self->client->name();
    my $io_path      = "/$file_pattern/$$/$name";

    log_global->exit();
    return $io_path;
}    ## end sub _get_io_path

## Create the io path under the junction point and open up its
## permissions (recursive chmod 777 issued at root privilege level).
sub _mkdir_for_io {
    log_global->enter();
    my ( $self, $io_path ) = @_;

    my $api = $self->_client_api();
    $api->mkdir( 'make-parents' => '1', 'paths' => $io_path );
    $api->chmod(
        mode              => '777',
        recursive         => 1,
        paths             => $io_path,
        'privilege-level' => 'root',
    );

    log_global->exit();
}    ## end sub _mkdir_for_io

## Mounts the object's volume on the object's client through the mount
## stask, chmods the mount point to 777 and returns the mount object.
sub _perform_vol_mount {
    log_global->enter();
    my $self = shift;

    ## Manage the volume mount here
    ## Mount the volume using the mount stask.
    my $mount_obj = NACL::STask::Client::Mount->create(
        command_interface  => $self->client(),
        volume             => $self->volume(),
        nacltask_if_exists => 'reuse',
        exclusive          => 1
    );

    $self->_client_api()->chmod(
        mode              => '777',
        paths             => $mount_obj->mount_point(),
        'privilege-level' => 'root',
    );

    log_global->exit();
    return $mount_obj;
}    ## end sub _perform_vol_mount

## Normalizes the options of start().
##   Instance call: client, volume, threshold and file_pattern must not be
##     passed (NATE::Exceptions::Argument otherwise); they are copied from
##     the object. mount_obj may be passed, as documented for start(); a
##     caller supplied mount_obj takes precedence over the stored one.
##   Class call: client, volume and threshold are mandatory
##     (NATE::Exceptions::Argument when missing).
## Returns the resulting option hash.
sub _process_inputs {
    log_global->enter();
    my ( $pkg_or_obj, %opts ) = @_;

    if ( ref $pkg_or_obj ) {

        # Instance method call. We should get client,volume,
        # threshold file_pattern from the object itself.
        foreach my $key (qw(client volume threshold file_pattern)) {
            if ( defined $opts{$key} ) {
                log_global->exit();
                NATE::Exceptions::Argument->throw(
                        "$key is already an attribute of this object "
                      . "and must not be specified to NACL::MTask::NDOChecker::start"
                );
            }
        }

        # Populate the input hash from the attributes that are set
        foreach my $key (qw(client volume threshold file_pattern mount_obj)) {

            # Only mount_obj can already be present at this point
            next if defined $opts{$key};

            my $key_isset = "${key}_isset";
            if ( $pkg_or_obj->$key_isset() ) {
                $opts{$key} = $pkg_or_obj->$key();
            }
        }
    }
    else {

        # static method call
        foreach my $key (qw(client volume threshold )) {
            unless ( defined $opts{$key} ) {
                log_global->exit();
                NATE::Exceptions::Argument->throw(
                        "Mandatory parameter $key missing in call "
                      . "to NACL::MTask::NDOChecker::start" );
            }
        }
    }

    log_global->exit();
    return %opts;
}    ## end sub _process_inputs

1;