#!/usr/local/bin/perl
# checks HSx series raid controllers for proper function
######
# make damn sure that only one instance of hszterm is running
# at any one time or you will likely domain panic and it's not my
# fault because I warned you here!
######
# chkopen() below refuses to start while another copy of this script
# holds its lock file open, which minimizes this risk; it does not
# protect you from hszterm being started by anything else
# * requires the unsupported hszterm software available from the
# January 2000 Softw. Prod. Lib. CD (or older) from compaq.
# * also requires that hszterm be setuid root or perhaps run with
# sudo support.
# * check variables below for their appropriateness to your system
# tested on HSZ40, HSZ70, HSZ80, HSG80 controllers
# pat0523@Thedacare.org
#
# pat0523	03/19/2002	install
# pat0523	03/20/2002	make better use of $ENV, add int()
# pat0523	03/27/2002	check and open lock file, add $msgttl
# pat0523	03/28/2002	add stripeset check
# pat0523	05/13/2002	check for danger in battery section
# pat0523	01/07/2003	abort for too few lines returned by hszterm
#				which might indicate serious problems.
# jwe0224	02/14/2003	updated $disk for Tru64 v5.1A
# jwe0224	03/06/2003	updated $disk for Tru64 v5.1A patchkit 4
# pat0523	08/13/2003	give the red's priority in the output
# pat0523	10/23/2003	check for units in failedset
#
sub variables {
#
# Initialise every package global used by the other routines.
# The first half is site configuration -- review it for each host.
# The second half is working state and is not normally edited.
#

# --- values inherited from the big brother environment ---
# $bb   : path to the big brother network client
# $bbs  : name or IP address of BBHOST
# $host : our hostname
	$bb   = $ENV{'BB'};
	$bbs  = $ENV{'BBDISP'};
	$host = $ENV{'MACHINE'};

# --- what we report ---
# $page   : name of page to create on BBHOST
#           (HPUX makes a "diskchk" raid page, AIX makes a "lvm" raid page)
# $msgtyp : type of message to create on BBHOST
# $msgttl : time to live of the message, format +XX where XX=minutes
# $title  : heading placed in the status message
	($page, $msgtyp, $msgttl) = ("diskchk", "status", "+1500");
	$title = "HSx RAID controller status:";

# --- external programs ---
# $hszterm  : path to a setuid hszterm :-(
# $lsofpath : path to lsof
	$hszterm  = "/usr/users/bb/hszterm";
	$lsofpath = "/usr/bin/lsof";

# --- device used to talk to the controller ---
# select a disk on the HSx with an unused partition, perhaps using
# output from /sbin/showfdmn <domainname> on one of your domains
# future to-do: make use of the bus, target and LUN options
# that compaq recommends rather than the device name which they don't
###
# important
###
# select a partition that lsm, advfs, ufs, or swap is not using,
#   d is often a likely choice;  check disktab to know for sure
#
#	$disk="/dev/rz9d";            #Tru64 4.0G
#	$disk="/dev/disk/dsk1d";      #Tru64 5.1A SCSI-2
#	$disk="/dev/cport/scp0";      #Tru64 5.1x SCSI-3 CCL
	$disk = "/dev/cport/scp0";

# --- switches, 1=on ---
# $stripe : this controller serves stripe sets, so run the stripeset check
# $debug  : emit extra debugging information
	$stripe = 0;
	$debug  = 0;

# --- working state, not usually changed ---
	$hsx = "Undefined HSx controller";
	($RAID, $section, $message, $bblog) = ("") x 4;
	($redundant, $msgindex) = (0) x 2;
	$date     = scalar(localtime());
	$pathname = "$ENV{'BBTMP'}/bbhsx.tmp";
	$mycolor  = "green";
}
sub compilelog {
#
# Fold the colour recorded for the current check ($color[$msgindex], if
# any) into the overall colour $mycolor: red always wins, yellow only
# replaces a non-red colour.  Then rebuild the status header in $bblog
# and advance $msgindex so the next check gets its own message slot.
#
	my $reported = $color[$msgindex];
	if (defined($reported)) {
		if ($reported eq "red") {
			$mycolor = "red";
		}
		elsif ($mycolor ne "red" && $reported eq "yellow") {
			$mycolor = "yellow";
		}
	}
#
# header layout: <type><ttl> <host>.<page> <colour> <date> <title>, then
# the controller identification line and a blank line
#
	$bblog = sprintf("%s%s %s.%s %s %s %s\n%s\n\n",
		$msgtyp, $msgttl, $host, $page, $mycolor, $date, $title, $hsx);
	$msgindex++;
}
sub thisfull {
#
# Runs "show this full" against the default controller and records one
# status message per finding:
#   * the controller identification line is kept in $hsx
#   * $redundant is set to 1 when the controller section contains
#     "In dual-redundant"
#   * "Cache is ..." and "Battery is ..." lines are judged GOOD / not GOOD
#   * a separate "Battery:" section (see below) is judged as a whole
# Dies when hszterm returns fewer than 8 lines, which might indicate
# serious problems.
#
	my @line;
	my $battmp="";	# text collected from a "Battery:" section
	my $state=0;	# 1 while a "Battery:" section still awaits a verdict
	$section="";
	my @tmpHSZ=`$hszterm -f $disk "show this full"`;
	if ( $#tmpHSZ < 7 ) {
		die("@tmpHSZ\n\nAbort: hszterm returned too few lines");
		}
#
# this sad hack attempts to handle batteries in HSG firmware
# the lines of a Battery: section are assembled into $battmp and examined
# as soon as the section ends.  if the string FULLY CHARGED is present
# then we will assume all is OK, DANGER gives red, anything else yellow.
# this makes no attempt to deal with other battery related warning
# messages
#
	my $judgebattery=sub {
		if ($battmp =~ /FULLY CHARGED/ ) {
			$message[$msgindex]="&green Battery OK\n";
			}
		elsif ($battmp =~ /DANGER/ ) {
			$color[$msgindex]="red";
			$message[$msgindex]="&red Check battery status.\n";
			}
		else {
			$message[$msgindex]="&yellow Check battery status.\n";
			$color[$msgindex]="yellow";
			}
		compilelog();
		$battmp="";
		$state=0;
		};
	foreach (@tmpHSZ) {
		chomp();
#
# section headers start in column one
#
		my $newsection="";
		if ($_ =~ /^Controller:/ ) {
			$newsection="controller";
			}
		elsif ($_ =~ /^Cache:/ ) {
			$newsection="cache";
			}
		elsif ($_ =~ /^Mirrored Cache:/ ) {
			$newsection="mirrored cache";
			}
		elsif ($_ =~ /^Battery:/ ) {
			$newsection="battery";
			}
		elsif ($_ =~ /^Extended information:/ ) {
			$newsection="extended information";
			}
		if ($newsection ne "") {
#
# judge a pending battery section right here, at the header that ends
# it, so that no data line of the following section gets used up by
# the battery verdict
#
			if ($state==1) { $judgebattery->(); }
			$section=$newsection;
			if ($section eq "battery") {
				$state=1;
				$battmp="";
				}
			next;
			}
		if ($section eq "controller" && $_ =~ "HS" ) {
			$hsx=$_;
			next;
			}
		elsif ($section eq "controller" && $_ =~ "In dual-redundant" ) {
			$redundant=1;
			next;
			}
		if ($state==1) {
# still inside the Battery: section; keep line boundaries so a match
# cannot straddle two lines
			$battmp=$battmp.$_."\n";
			if ($debug==1) { print("$section\t$_\n"); }
			next;
			}
		elsif ($_ =~ "Battery is") {
#
# this battery section only works on older HSZ controllers?
#
			@line=split("Battery is ",$_);
			if ($line[1] =~ "GOOD" ) {
				$message[$msgindex]="&green ".$section." battery is good.\n";
				compilelog();
				}
			else {
				$message[$msgindex]="&yellow Check ".$section." battery status\n";
				$color[$msgindex]="yellow";
				compilelog();
				}
			}
		elsif ($_ =~ "Cache is") {
			@line=split("Cache is ",$_);
			if ($line[1] =~ "GOOD" ) {
				$message[$msgindex]="&green $section is good.\n";
				compilelog();
				}
			else {
				$message[$msgindex]="&yellow Check $section cache status\n";
				$color[$msgindex]="yellow";
				compilelog();
				}
			}
		if ($debug==1) { print("$section\t$_\n"); }
		}
#
# a Battery: section that runs to the end of the output has no following
# header to trigger the verdict
#
	if ($state==1) { $judgebattery->(); }
}
sub otherfull {
#
# Runs "show other full" against the partner controller and records one
# status message per finding:
#   * "Cache is ..." and "Battery is ..." lines are judged GOOD / not GOOD
#   * a separate "Battery:" section (see below) is judged as a whole
# Dies when hszterm returns fewer than 8 lines, which might indicate
# serious problems.
#
# The battery text is collected in a variable private to this routine,
# so nothing gathered while examining the default controller can
# influence the verdict for this one.
#
	my @line;
	my $battmp="";	# text collected from a "Battery:" section
	my $state=0;	# 1 while a "Battery:" section still awaits a verdict
	$section="";
	my @tmpHSZ=`$hszterm -f $disk "show other full"`;
	if ( $#tmpHSZ < 7 ) {
		die("@tmpHSZ\n\nAbort: hszterm returned too few lines");
		}
#
# this sad hack attempts to handle batteries in HSG firmware
# the lines of a Battery: section are assembled into $battmp and examined
# as soon as the section ends.  if the string FULLY CHARGED is present
# then we will assume all is OK, DANGER gives red, anything else yellow.
# this makes no attempt to deal with other battery related warning
# messages
#
# NOTE(review): the red text does not say "other"; it is kept exactly as
# it was in case anything matches on the message -- confirm before
# rewording
#
	my $judgebattery=sub {
		if ($battmp =~ /FULLY CHARGED/ ) {
			$message[$msgindex]="&green other battery OK\n";
			}
		elsif ($battmp =~ /DANGER/ ) {
			$color[$msgindex]="red";
			$message[$msgindex]="&red Check battery status.\n";
			}
		else {
			$message[$msgindex]="&yellow check other battery status.\n";
			$color[$msgindex]="yellow";
			}
		compilelog();
		$battmp="";
		$state=0;
		};
	foreach (@tmpHSZ) {
		chomp();
#
# section headers start in column one
#
		my $newsection="";
		if ($_ =~ /^Controller:/ ) {
			$newsection="other controller";
			}
		elsif ($_ =~ /^Cache:/ ) {
			$newsection="other cache";
			}
		elsif ($_ =~ /^Mirrored Cache:/ ) {
			$newsection="other mirrored cache";
			}
		elsif ($_ =~ /^Battery:/ ) {
			$newsection="other battery";
			}
		elsif ($_ =~ /^Extended information:/ ) {
			$newsection="other extended information";
			}
		if ($newsection ne "") {
#
# judge a pending battery section right here, at the header that ends
# it, so that no data line of the following section gets used up by
# the battery verdict
#
			if ($state==1) { $judgebattery->(); }
			$section=$newsection;
			if ($section eq "other battery") {
				$state=1;
				$battmp="";
				}
			next;
			}
		if ($state==1) {
# still inside the Battery: section; keep line boundaries so a match
# cannot straddle two lines
			$battmp=$battmp.$_."\n";
			if ($debug==1) { print("$section\t$_\n"); }
			next;
			}
		elsif ($_ =~ "Battery is") {
#
# this battery section only works on older HSZ controllers?
#
			@line=split("Battery is ",$_);
			if ($line[1] =~ "GOOD" ) {
				$message[$msgindex]="&green ".$section." battery is good.\n";
				compilelog();
				}
			else {
				$message[$msgindex]="&yellow Check ".$section." battery status\n";
				$color[$msgindex]="yellow";
				compilelog();
				}
			}
		elsif ($_ =~ "Cache is") {
			@line=split("Cache is ",$_);
			if ($line[1] =~ "GOOD" ) {
				$message[$msgindex]="&green $section is good.\n";
				compilelog();
				}
			else {
				$message[$msgindex]="&yellow Check $section cache status\n";
				$color[$msgindex]="yellow";
				compilelog();
				}
			}
		if ($debug==1) { print("$section\t$_\n"); }
		}
#
# a Battery: section that runs to the end of the output has no following
# header to trigger the verdict
#
	if ($state==1) { $judgebattery->(); }
}
sub showraid {
#
# Load the raidset listing into @RAID (one chomped line per element) and
# tag it via $section, so the generic parseraid routine can be used for
# raid, mirror and stripe output alike.
#
	my $query = "show raid full";
	$section = "raidset";
	chomp(@RAID = `$hszterm -f $disk "$query"`);
}
sub showmirror {
#
# Load the mirrorset listing into @RAID (one chomped line per element)
# and tag it via $section, so the generic parseraid routine can be used
# for raid, mirror and stripe output alike.
#
	my $query = "show mirror full";
	$section = "mirrorset";
	chomp(@RAID = `$hszterm -f $disk "$query"`);
}
sub showstripe {
#
# Load the stripeset listing into @RAID (one chomped line per element)
# and tag it via $section, so the generic parseraid routine can be used
# for raid, mirror and stripe output alike.
#
	my $query = "show stripe full";
	$section = "stripeset";
	chomp(@RAID = `$hszterm -f $disk "$query"`);
}
sub showfailed {
#
# Ask the controller for its failedset and report on it: every line
# mentioning FAILEDSET yields one message -- red (quoting the line) when
# the line also names a DISK, green otherwise.  Lines without FAILEDSET
# are ignored.  Dies when hszterm returns fewer than 3 lines.
#
	$section = "failedset";
	chomp(@RAID = `$hszterm -f $disk "show failed"`);
	if (scalar(@RAID) < 3) {
		die("@RAID\n\nAbort $section: hszterm returned too few lines");
	}
	for my $row (@RAID) {
		next unless $row =~ /FAILEDSET/;
		if ($row =~ /DISK/) {
			$message[$msgindex] = "&red $row\n";
			$color[$msgindex] = "red";
		}
		else {
			$message[$msgindex] = "&green No disks in failedset\n";
		}
		compilelog();
	}
}
sub parseraid {
#
# Walk the listing a show* routine left in @RAID and report the overall
# state of each storageset: green when the line that follows "State:"
# contains NORMAL, red for anything else.
# A line matching $section starts a new storageset; its first word is
# taken as the unit name.
# No attempt is made to diagnose down to an individual disk, that is
# why you make the big bucks.
# Dies when @RAID holds fewer than 3 lines.
#
	if (scalar(@RAID) < 3) {
		die("@RAID\n\nAbort $section: hszterm returned too few lines");
	}
	my $unitname = "";
	my $awaiting = 0;	# true only for the line right after "State:"
	for my $row (@RAID) {
		if ($row =~ $section) {
			($unitname) = split(" ", $row);
			$awaiting = 0;
			next;
		}
		if ($row =~ /State:/) {
			$awaiting = 1;
			next;
		}
		next unless $awaiting;
		if ($row =~ /NORMAL/) {
			$message[$msgindex] = "&green $section $unitname $row\n";
		}
		else {
			$message[$msgindex] = "&red $section $unitname $row\n";
			$color[$msgindex] = "red";
		}
		compilelog();
# only the first line after "State:" is judged
		$awaiting = 0;
	}
}
sub moredebug {
#
# items to check that our BB environment is coming through
# each variable is written to STDERR as NAME=value; a variable that is
# missing from the environment is shown as "(not set)" rather than
# producing perl's generic "Warning: something's wrong" text
#
	warn("\nBB Environment:\n");
	foreach my $name ("BBHOME", "BB", "MACHINE", "BBTMP", "BBDISP") {
		my $value = defined($ENV{$name}) ? $ENV{$name} : "(not set)";
		warn("$name=$value\n");
		}
}
sub chkopen {
# this is intended to make sure that big brother never steps on itself
# opening two bb-hsx jobs at once.
#
# lsof is asked whether any process has the lock file $pathname open; any
# output at all is taken to mean another copy is running and we die.
# otherwise the lock file is opened for writing on the handle LCKFIL,
# which stays open until the main program closes it.
# dies as well when lsof is missing or the lock file cannot be opened.
#
# NOTE(review): the lsof test and the open are two separate steps, so two
# copies started at the very same moment could both get through.
#
	if (! -e $lsofpath ) {
		die("Can't open lsof ($lsofpath not found)");
		}
	if ((`$lsofpath -w $pathname`) ne "" ) {
		die("$pathname open, script already running");
		}
# three argument open: the path is only ever treated as a file name
	open(LCKFIL, ">", $pathname) or die("Can't open lockfile $pathname: $!");
}
sub sendlog {
#
# create a status update by combining the initial message with the various
# updates from the individual tests, then send it out.
#
# every message containing &red is listed first (each followed by CR LF)
# so the reds get priority in the output; after that all messages follow
# in the order the checks produced them, the reds included once more.
#
# the client is run without a shell: the status text contains lines taken
# straight from the controller, and a quote, dollar sign or backquote in
# them must reach BBHOST unchanged instead of being interpreted.
# $bb and $bbs are split on white space, as the shell did for the
# unquoted words of the old command line.
# NOTE(review): other shell expansion of $bb / $bbs (~, variables) is no
# longer performed -- confirm none of your settings relied on it.
#
	my $msg="";
	foreach my $entry (@message) {
		if ($entry =~ /\&red/ ) {
			$msg=$msg.$entry."\r\n";
			}
		}
	$msg=$msg.join("",@message);
	$bblog=$bblog.$msg."\n";
	if ($debug==1) { warn("$bblog\n"); }
	my @cmd=(split(" ",defined($bb) ? $bb : ""),
		split(" ",defined($bbs) ? $bbs : ""));
	if (scalar(@cmd)==0) {
# with nothing in front of it the message itself would be run as a command
		warn("sendlog: no big brother client configured, status not sent\n");
		return;
		}
	my $rc=system(@cmd,$bblog);
	if ($rc != 0) {
		warn("sendlog: $cmd[0] could not deliver the status (system returned $rc)\n");
		}
	return;
}
###
# MAIN
###
# set up the globals, then make sure no other copy is running before
# hszterm is touched
variables();
moredebug() if ($debug==1);
chkopen();

# controller, cache and battery state of both controllers
thisfull();
otherfull();

# storagesets: load a listing into @RAID, then judge it
foreach my $loader (\&showraid, \&showmirror) {
	$loader->();
	parseraid();
}
if ($stripe==1) {
	showstripe();
	parseraid();
}
showfailed();

# ship the combined status and release the lock file
sendlog();
close(LCKFIL);
exit;
