#! /bin/sh
 
#    Jun 16/14 - F.Majaess
 
#id  mntrjhm - Monitoring script for JHOME associated job.
 
#    AUTHOR  - F.Majaess
 
#hd  PURPOSE - "mntrjhm" script is used to check on and report status of
#hd            JHOME associated back-end jobs.
#hd            Note: Only one of the
#hd                  "fl"/"ar"/"ars"/"hs"/"ns"/"sns"/"na"/"sna"/"pb"/"npb"/"rj" 
#hd                  options (described below) can be specified at a time.
 
#pr  PARAMETERS:
#pr 
#pr    PRIMARY
#pr 
#pr      target = target IBM "ha/sp" cluster/node to do the check on.
#pr               (="local host" if invoked from one of the IBM
#pr                                 back-end(s),
#pr                ="hadar", otherwise)
#pr      user   = username - used only in conjunction with the "own"
#pr               parameter below (=$USER)
#pr      filter = a string to apply to "egrep" as a filter for the output
#pr 
#pr      fl     = switch to output full "[spica,hadar]_jhome_chk_log" file
#pr               contents of all scanned jhome based jobs. (default).
#pr      ar     = switch to output "[spica,hadar]_jhome_chk_log_autorestartable"
#pr               file contents of "autorestartable" jhome jobs.
#pr      ars    = switch to output "[spica,hadar]_jhome_chk_log_autoresub"
#pr               file contents of "autoresubmitted" jhome jobs.
#pr      hs     = switch to output massaged 
#pr                "[spica,hadar]_jhome_chk_log_autoresub_historical"
#pr               historical log file contents of "autoresubmitted" jhome jobs.
#pr 
#pr      ns     = switch to output "[spica,hadar]_jhome_chk_log_needsubmission"
#pr               file contents of jhome scanned jobs that need submission.
#pr      sns    = switch to list 
#pr                 "[spica,hadar]_jhome_chk_log_simplifiedneedsubmission"
#pr               file of simplified columns of jhome scanned jobs that 
#pr               need submission.
#pr      na     = switch to output "[spica,hadar]_jhome_chk_log_needattention"
#pr               file contents of jhome scanned jobs that need attention.
#pr      sna    = switch to list 
#pr                 "[spica,hadar]_jhome_chk_log_simplifiedneedattention"
#pr               file of simplified columns of jhome scanned jobs that 
#pr               need attention.
#pr      pb     = switch to output just inactive "post_block" jobs which need
#pr               attention from "[spica,hadar]_jhome_chk_log_needattention" 
#pr               file contents.
#pr      npb    = switch to output other than inactive "post_block" jobs which
#pr               need attention from "[spica,hadar]_jhome_chk_log_needattention" 
#pr               file contents.
#pr 
#pr      rj     = switch to request running "chkcjhm" script to identify
#pr               inactive restartable jobs which still need to run in "jhome"
#pr               subdirectory. 
#pr      dirlist= Valid only in combination with "rj" switch. 
#pr               It allows specifying optional list of full path of valid JHOME 
#pr               subdirectories on "target" back-end cluster to pass on to 
#pr               "chkcjhm" call.
#pr               (='/fs/dev/crb/??d0[1-3]/data/JHOME_*')
#pr               
#pr      nt     = number of times to perform the check 
#pr               (=100 for "rj" case;
#pr                =1   otherwise)
#pr      ri     = interval time between the checks in seconds (=300)
#pr 
#pr   SECONDARY
#pr 
#pr        own = switch to list only user's own jhome jobs
#pr              (=no/yes)
#pr 
 
#ex  EXAMPLE: 
#ex 
#ex    mntrjhm [fl] sp 
#ex 
#ex  The above will list status of all scanned jhome jobs by 
#ex  displaying "spica_jhome_chk_log" file contents. 
#ex 
#ex    mntrjhm ha ar
#ex 
#ex  List "hadar_jhome_chk_log_autorestartable" file contents 
#ex  of "autorestartable" jhome jobs on Hadar.
#ex 
#ex    mntrjhm sp ars
#ex 
#ex  List "spica_jhome_chk_log_autoresub" file contents 
#ex  of "autoresubmitted" jhome jobs on Spica.
#ex
#ex    mntrjhm ha hs own user=acrnpd1 filter='eig-001|eig-002'
#ex 
#ex  List massaged "hadar_jhome_chk_log_autoresub_historical" file 
#ex  entries, for "eig-001"/"eig-002" runs under "acrnpd1" account,
#ex  of "autoresubmitted" jhome jobs on Hadar.
#ex 
#ex    mntrjhm ha sp ns
#ex 
#ex  Output contents of "[spica,hadar]_jhome_chk_log_needsubmission" 
#ex  files of jhome scanned jobs identified in need of submission.
#ex 
#ex    mntrjhm ha sp na
#ex 
#ex  Output contents of "[spica,hadar]_jhome_chk_log_needattention" 
#ex  files of jhome scanned jobs identified in need of attention.
#ex 
#ex    mntrjhm ha sp na own
#ex 
#ex  List all invoking account jhome jobs, on Hadar and Spica,
#ex  requiring attention by filtering out the contents of
#ex  "[spica,hadar]_jhome_chk_log_needattention" files.
#ex 
#ex    mntrjhm sp pb own user=acrnxyz filter=' aba | abb ' 
#ex 
#ex  List filtered "post_block" jobs on Spica for "acrnxyz" account,
#ex  from "spica_jhome_chk_log_needattention" file, which are in
#ex  need of attention and are associated with "aba"/"abb" runs.
#ex 
#ex    mntrjhm ha npb own user=acrnxyz 
#ex 
#ex  List from "hadar_jhome_chk_log_needattention" file, entries 
#ex  associated with other than "post_block" jhome jobs on Hadar 
#ex  for "acrnxyz", which are identified in need of attention.
#ex 
#ex    mntrjhm rj ha sp nt=5 ri=180
#ex 
#ex  The above leads to scanning all "JHOME" subdirectories on Hadar
#ex  and Spica, up to 5 times at 3 minutes interval, and report on 
#ex  restartable jhome jobs.
#ex 
#ex   mntrjhm rj ha \
#ex     dirlist=/fs/dev/crb/had02/data/JHOME_abc_2301m01_2310m12_20140521060253_9437216 
#ex 
#ex  The above will result in confining the check on Hadar to "dirlist".
#ex 
#ex    mntrjhm rj ha filter='abc|def'
#ex  
#ex  Do the check on Hadar and limit the output to lines containing "abc" 
#ex  or "def" (runid) substring occurrence in them.
#ex 
#ex    mntrjhm rj ha own user=acrnxyz filter="abc|def"
#ex  
#ex  Similar to the previous example with output filtered by "user" 
#ex  and "filter".
#ex  

# Check and possibly adjust for parameters specified on the 
# script call ...
#
# Recognized arguments:
#   -*            passed as-is to "set" (e.g. "-x" to trace execution)
#   nt=N ri=SEC   number of checks and interval between them
#   filter=REGEX  extended regular expression applied to the output
#   user=NAME     account name used together with "own"
#   dirlist=...   kept whole (including the "dirlist=" prefix) in "Optn"
#   own           sets own=yes
#   fl|ar|ars|hs|ns|sns|na|sna|pb|npb|rj
#                 report selector stored in "ooptn"; a second one is an
#                 error (exit status 1)
#   anything else is appended to the blank separated "target" list
#
# "key=value" values are extracted with "${1#key=}" so they are kept
# verbatim.  Piping an unquoted $1 through echo/sed would word-split and
# glob-expand the value, e.g. dropping the trailing blank of
# filter=' aba | abb '.

unset target
unset ooptn
while [ $# -gt 0 ]
  do
    case "$1" in
    -*) set $1 ; shift ;;
    nt=*) nt=${1#nt=} ; shift ;;
    ri=*) ri=${1#ri=} ; shift ;;
    fl|ar|ars|hs|ns|sns|na|sna|pb|npb|rj)
        if [ -z "$ooptn" ] ; then
          ooptn="$1" ; shift
        else
          echo "Only one of: fl/ar/ars/hs/ns/sns/na/sna/pb/npb/rj is allowed!" >&2
          exit 1
        fi ;;
    filter=*) filter=${1#filter=} ; shift ;;
    dirlist=*) Optn="$1" ; shift ;;
    user=*) user=${1#user=} ; shift ;;
    own) own=yes ; shift ;;
    *) target="$target $1" ; shift ;;
    esac
  done

# Set the defaults.
#
#   ooptn : report selector ('fl' when none was given)
#   nt    : number of checks (100 for "rj", 1 otherwise)
#   ri    : seconds between checks (300)
#   own   : 'yes' limits the output to lines matching "user" ('no')
#   user  : $USER, then $LOGNAME, then the output of "id -un", so that
#           "own" still works when USER is not set in the environment
#
# "ofilter" receives the per-user filter command: "cat" when "own" is not
# requested, otherwise an extended-regexp grep on "user".  It is meant to
# be expanded unquoted, hence it is kept as a plain blank separated string.

: "${ooptn:=fl}"
if [ "$ooptn" = 'rj' ] ; then
  : "${nt:=100}"
else
  : "${nt:=1}"
fi
: "${ri:=300}"
: "${own:=no}"
if [ -z "$user" ] ; then
  user=${USER:-${LOGNAME:-$(id -un 2>/dev/null)}}
fi
if [ "$own" = 'yes' ] ; then
  if [ -z "$user" ] ; then
    # A grep without a pattern would only print its usage message.
    echo 'Unable to determine the account name needed by "own"; specify user=...' >&2
    exit 1
  fi
  # "grep -E" is the standard spelling of the obsolescent "egrep"; "-e"
  # keeps a pattern starting with "-" from being taken as an option.
  ofilter="grep -E -e $user"
else
  ofilter="cat"
fi

# Determine the cluster this host belongs to ("lclhost") and, when no
# target was given on the command line, the cluster to report on.
# On the Dorval AIX hosts the first three characters of HOSTID decide:
# c1f/c1h/c1r/c1s select Spica, anything else selects Hadar, and the
# default target is that same cluster.  Everywhere else "lclhost" is
# HOSTID itself and the default target is Hadar.

if [ "$SITE_ID" = 'Dorval' ] && [ "$OS" = 'AIX' ] ; then
  HOSTIDf=$(echo $HOSTID | cut -c 1-3)
  case $HOSTIDf in
    c1[fhrs]*) lclhost='spica' ;;
            *) lclhost='hadar' ;;
  esac
  : "${target:=$lclhost}"
else
  lclhost="$HOSTID"
  : "${target:=hadar}"
fi

# Translate each "target" entry into a cluster name and collect the result
# in "nodes_2chk".  Unknown entries are reported and ignored, so
# "nodes_2chk" stays unset when no entry is valid.

unset nodes_2chk
for pltfrm in $target ; do
  case $pltfrm in
    ha|hadar) cluster='hadar' ;;
    sp|spica) cluster='spica' ;;
    *) echo "Skipped invalid entry: ${pltfrm}!" ; continue ;;
  esac
  nodes_2chk="$nodes_2chk $cluster"
done

# Build in "cmd" the command line matching the option held in "ooptn".
# The "\${nod}" part is deliberately left unexpanded: it is only resolved
# when "cmd" is later passed to "eval" with "nod" set.  "ooptn" values
# not listed below leave "cmd" untouched.

  jh_log="cat $CCRNINFO/\${nod}_jhome_chk_log"
  case $ooptn in
    fl)  cmd="${jh_log}" ;;
    ar)  cmd="${jh_log}_autorestartable" ;;
    ars) cmd="${jh_log}_autoresub" ;;
    hs)  cmd="${jh_log}_autoresub_historical" ;;
    ns)  cmd="${jh_log}_needsubmission" ;;
    sns) cmd="${jh_log}_simplifiedneedsubmission" ;;
    na)  cmd="${jh_log}_needattention" ;;
    sna) cmd="${jh_log}_simplifiedneedattention" ;;
    pb)  cmd="${jh_log}_needattention | egrep 'As of|post_block' " ;;
    npb) cmd="${jh_log}_needattention | egrep -v post_block " ;;
    rj)  cmd="chkcjhm $Optn" ;;
    *)   : ;;
  esac

#   ****   Task of the script...   ****
#
# For each of the "nt" passes and for every cluster listed in
# "nodes_2chk", run the command held in "cmd" and print its output,
# narrowed by "ofilter" and, when set, by the "filter" regular expression:
#   rj    - "cmd" is evaluated here when the cluster equals "lclhost",
#           and run through ssh on the cluster otherwise;
#   hs    - only the 'Last resubmission at' lines are kept and they are
#           re-laid out as a four column table under a fixed heading;
#   other - the output of "cmd" is listed.
# Passes are "ri" seconds apart.  The script exits with status 1 when
# "nodes_2chk" is empty, when "nt" is not a whole number or when "sleep"
# fails (e.g. on an invalid "ri").

# awk command to use: $AWK when set, plain "awk" otherwise.  With $AWK
# empty the pipelines below would try to run "-F" as a command.
mntr_awk=${AWK:-awk}

# Copy stdin to stdout through "ofilter" ("cat" if it is empty) and, when
# "filter" is set, through an extended-regexp grep on it.
mntrjhm_select ()
{
  if [ -n "$filter" ] ; then
    ${ofilter:-cat} | grep -E -e "$filter"
  else
    ${ofilter:-cat}
  fi
}

# Rewrite "|" separated input lines as "| f2 | f3 | f4 | fN |", N being
# the field number given as first argument.
mntrjhm_table ()
{
  $mntr_awk -F '|' '{ printf "%s%s%s%s%s%s%s%s%s\n","| ",$2," | ",$3," | ",$4," | ",$'"$1"'," |" ;}'
}

if [ -z "$nodes_2chk" ] ; then
  echo 'A valid target cluster needs to be specified!' >&2
  exit 1
fi

# "nt" drives integer comparisons; reject anything else up front instead
# of letting "[" fail and the script silently do nothing.
case $nt in
  ''|*[!0-9]*)
    echo "Invalid nt=\"${nt}\": a whole number of checks is expected!" >&2
    exit 1 ;;
esac

while [ "$nt" -gt 0 ]
do

  # Leading blank lines, plus a time stamp for the "rj" and "hs" reports.
  case $ooptn in
    rj) echo "" ; echo "As of $(date):" ; echo "" ; echo "" ;;
    hs) echo "" ; echo "" ; echo "As of $(date):" ; echo "" ;;
     *) echo "" ; echo "" ;;
  esac

  for nod in $nodes_2chk
  do
    case $ooptn in
      rj)
        echo "=== ${nod} check:"
        if [ "$nod" = "$lclhost" ] ; then
          eval "$cmd" | mntrjhm_select
        else
          ssh ${nod} "${cmd}" | mntrjhm_select
        fi
        ;;
      hs)
        echo "=== On ${nod}:" ; echo ""
        ( echo '| ACCOUNT |   RUNID   |     Chunk       |      Date of autosubmission      |' ; echo '| ------- |   -----   |     -----       |      ----------------------      |' ) | mntrjhm_table 5
        echo ' '
        eval "$cmd" \
          | grep 'Last resubmission at' \
          | mntrjhm_table 11 \
          | sed -e 's/ Last resubmission at//' -e 's/) *|$/) |/' \
          | mntrjhm_select
        ;;
      *)
        echo "=== On ${nod}:" ; echo ""
        eval "$cmd" | mntrjhm_select
        ;;
    esac
    echo ""
  done

  # "expr" treats a value such as "08" as decimal.
  nt=$(expr "$nt" - 1)
  if [ "$nt" -ge 1 ] ; then
    echo " Remaining checks: ${nt} times at ${ri} sec. interval " ; echo ""
    # Stop rather than loop without any pause when the interval is unusable.
    sleep "${ri}" || { echo "Unable to wait ri=\"${ri}\" seconds between checks!" >&2 ; exit 1 ; }
  fi
done
