#!/bin/bash
#
#   Usage: cmprs [options] rs1 rs2
#
# Purpose: Compare two restart files
#
# The restart files may be from the AGCM or the coupler. If rs1 and rs2 are netcdf files
# or tar files then it is assumed that they are coupler restart files. Otherwise it is
# assumed that rs1 and rs2 are AGCM restart files.
#
# Options:
# All options begin with a dash (-) and must appear before any other command line arg
#   -k  ..keep the execution directory created by this program
#         This dir will contain difference files, etc, that may be of interest to the user
#   -v  ..increase verbosity (multiple "v" options are cumulative)
#   -h  ..display a usage message and exit
#   -x  ..echo all shell commands (used for debugging)
#
########################################################################
#
# Larry Solheim ...Sep 2016
#
# Definitions:
#   verbose=INT  ..increase the amount of output produced by this program

FULLPATH=`type $0|awk '{print $3}'` # pathname of this script
Runame=`basename $FULLPATH`
usage() {
  err_exit=0
  while getopts e opt; do
    case $opt in
      e) err_exit=1 ;;
    esac
  done
  shift `expr $OPTIND - 1`

  [ -n "$1" ] && echo >&2 "${Runame}:" "$@"
  echo >&2 " "
  sed >&2 -n '/^###/q; s/^#$/# /; s/^ *$/# /; 3,$s/^# //p;' "$FULLPATH"
  if [ $err_exit -eq 0 ]; then
    exit
  else
    exit 1
  fi
}

# Create time stamp to be used in file names etc
stamp=`date "+%j%H%M%S"$$`

bail(){
  echo "${Runame}: *** ERROR *** $*"
  exit 1
}

# Set defaults
verbose=0
exit_status=0
keep_tmpd=0

# process command line options
while getopts hkvx opt
do
  case $opt in
    v) verbose+=1 ;;
    k) keep_tmpd=1 ;;
    h) usage  ;;
    x) set -x ;;
    -) shift; break ;; # end of options
    ?) usage -e $USAGE   ;;
  esac
done
shift `expr $OPTIND - 1`

# Add any definitions found on the command line to the current environment
for arg in "$@"; do
  case $arg in
    *=*) var=$(echo $arg|awk -F\= '{printf "%s",$1}' -)
         val=$(echo "$arg"|awk '{i=index($0,"=")+1;printf "%s",substr($0,i)}' -)
         # add this variable definition to the current environment
         [ -n "$var" ] && eval ${var}=\"\$val\"  # preserve quoted assignments
         val=$(echo $val|sed 's/^ *//; s/ *$//')  # remove leading and trailing space
         [ -z "$val" ] && bail "Invalid command line arg --> $arg <-- Empty value."
         ;;
  esac
done

# There should be 2 command line args remaining, namely 2 restart file names
rs1=$1
rs2=$2
[ -z "$rs1" -o -z "$rs2" ] && usage "Two restart file names are required on the command line"

full_path_rs1=''
rs1_dir=''
if [ -d $rs1 ]; then
  rs1_dir=$(readlink -f $rs1 2>/dev/null)
  [ -z "$rs1_dir" ] && usage "Unable to determine directory path for $rs1"
else
  [ -s $rs1 ] && full_path_rs1=$(readlink -f $rs1 2>/dev/null)
fi

full_path_rs2=''
rs2_dir=''
if [ -d $rs2 ]; then
  rs2_dir=$(readlink -f $rs2 2>/dev/null)
  [ -z "$rs2_dir" ] && usage "Unable to determine directory path for $rs2"
else
  [ -s $rs2 ] && full_path_rs2=$(readlink -f $rs2 2>/dev/null)
fi

# Create and change to a temporary directory with enough space to hold the files created here
CURR_DIR=$PWD
TMP_DIR=$CCRNTMP/tmp_cmprs_${UID}_$stamp
[ -d $TMP_DIR ] || mkdir $TMP_DIR
cd $TMP_DIR

rs1_is_tar=0
if [ -z "$rs1_dir" ]; then
  # This is not a directory
  # If a restart file was supplied as a full pathname on the command line then create a link to it
  if [ -n "$full_path_rs1" ]; then
    # Ensure rs1 does not contain pathname components and strip any trailing numeric extension
    rs1=$(basename ${rs1%.[0-9][0-9][0-9]})
    ln -s $full_path_rs1 $rs1
  fi

  # If the file is empty or does not exist then try an access
  [ -L $rs1 -o -s $rs1 ] || { access $rs1 $rs1 na; }
  [ -L $rs1 -o -s $rs1 ] || bail "$rs1 does not exist locally or on DATAPATH"

  # Determine if the restarts is a tar archives
  [ -n "$(file $rs1 | sed -n '/tar *archive/p' 2>/dev/null)" ] && rs1_is_tar=1
else
  # This is a directory
  # ---TODO--- what to do if either input file is a directory (assume nemo restart?)
  bail "Directory as input file is not currently supported"
fi

rs2_is_tar=0
if [ -z "$rs2_dir" ]; then
  # This is not a directory
  # If a restart file was supplied as a full pathname on the command line then create a link to it
  if [ -n "$full_path_rs2" ]; then
    # Ensure rs2 does not contain pathname components and strip any trailing numeric extension
    rs2=$(basename ${rs2%.[0-9][0-9][0-9]})
    ln -s $full_path_rs2 $rs2
  fi

  # If the file is empty or does not exist then try an access
  [ -L $rs2 -o -s $rs2 ] || { access $rs2 $rs2 na; }
  [ -L $rs2 -o -s $rs2 ] || bail "$rs2 does not exist locally or on DATAPATH"

  # Determine if the restart is a tar archives
  [ -n "$(file $rs2 | sed -n '/tar *archive/p' 2>/dev/null)" ] && rs2_is_tar=1
else
  # This is a directory
  # ---TODO--- what to do if either input file is a directory (assume nemo restart?)
  bail "Directory as input file is not currently supported"
fi

# rs1 and/or rs2 may get reassigned below ...save the original names here
rs1_real_name=$rs1
rs2_real_name=$rs2

# Identify the type of restart
rs1_type=0
rs2_type=0

if [ $rs1_is_tar -eq 1 ]; then
  # Extract a restart from this tar archive
  found_rs=0
  # Attempt to extract a coupler restart from this archive
  tarx_out=$(tar xf $rs1 cplrs_cpl_restart.nc 2>&1)
  if [ -s cplrs_cpl_restart.nc ]; then
    # This is a coupler restart archive
    rs1_type=1
    # Rename the restart to have a unique name and reassign rs1
    rs1=cpl_restart_1.nc
    mv cplrs_cpl_restart.nc $rs1
    found_rs=1
  else
    # Attempt to extract a NEMO restart from this archive
    mkdir nemo_rs1 || bail "Unable to create subdir nemo_rs1"
    cd nemo_rs1
    tarx_out=$(tar xf ../$rs1 2>&1)
    # NEMO ocean restart file names are of the form *_restart_[0-9][0-9][0-9][0-9].nc
    # NEMO ice restart file names are of the form *_restart_ice_[0-9][0-9][0-9][0-9].nc
    # NEMO tracer restart file names are of the form *_restart_trc_[0-9][0-9][0-9][0-9].nc
    rs1_ocn=$(ls -1 *_restart_[0-9][0-9][0-9][0-9].nc 2>/dev/null)
    rs1_ice=$(ls -1 *_restart_ice_[0-9][0-9][0-9][0-9].nc 2>/dev/null)
    rs1_trc=$(ls -1 *_restart_trc_[0-9][0-9][0-9][0-9].nc 2>/dev/null)
    [ -n "$rs1_ocn" -o -n "$rs1_ice" -o -n "$rs1_trc" ] && found_rs=1
    rs1_dir=nemo_rs1
    rs1_type=2
    cd ..
  fi
  [ $found_rs -eq 1 ] || bail "Unable to extract a restart from the archive $rs1"
fi

if [ $rs2_is_tar -eq 1 ]; then
  # Extract a restart from this tar archive
  found_rs=0
  # Attempt to extract a coupler restart from this archive
  tarx_out=$(tar xf $rs2 cplrs_cpl_restart.nc 2>&1)
  if [ -s cplrs_cpl_restart.nc ]; then
    # This is a coupler restart archive
    rs2_type=1
    # Rename the restart to have a unique name and reassign rs2
    rs2=cpl_restart_2.nc
    mv cplrs_cpl_restart.nc $rs2
    found_rs=1
  else
    # Attempt to extract a NEMO restart from this archive
    mkdir nemo_rs2 || bail "Unable to create subdir nemo_rs1"
    cd nemo_rs2
    tarx_out=$(tar xf ../$rs2 2>&1)
    # NEMO ocean restart file names are of the form *_restart_[0-9][0-9][0-9][0-9].nc
    # NEMO ice restart file names are of the form *_restart_ice_[0-9][0-9][0-9][0-9].nc
    # NEMO tracer restart file names are of the form *_restart_trc_[0-9][0-9][0-9][0-9].nc
    rs2_ocn=$(ls -1 *_restart_[0-9][0-9][0-9][0-9].nc 2>/dev/null)
    rs2_ice=$(ls -1 *_restart_ice_[0-9][0-9][0-9][0-9].nc 2>/dev/null)
    rs2_trc=$(ls -1 *_restart_trc_[0-9][0-9][0-9][0-9].nc 2>/dev/null)
    [ -n "$rs2_ocn" -o -n "$rs2_ice" -o -n "$rs2_trc" ] && found_rs=1
    rs2_dir=nemo_rs2
    rs2_type=2
    cd ..
  fi
  [ $found_rs -eq 1 ] || bail "Unable to extract a restart from the archive $rs2"
fi

# Determine if restarts are netcdf files
rs1_is_netcdf=1
ncdump_out=$(ncdump -k $rs1 2>/dev/null) || rs1_is_netcdf=0
rs2_is_netcdf=1
ncdump_out=$(ncdump -k $rs2 2>/dev/null) || rs2_is_netcdf=0

[ $rs1_is_netcdf -eq 1 ] && rs1_type=1
[ $rs2_is_netcdf -eq 1 ] && rs2_type=1

case "${rs1_type},$rs2_type" in
  0,0) # AGCM restarts
       separmc $rs1 RS1_$stamp y_$stamp z_$stamp >/dev/null
       separmc $rs2 RS2_$stamp y_$stamp z_$stamp >/dev/null
       rm -f y_$stamp z_$stamp
       files_differ=0
       cmp RS1_$stamp RS2_$stamp 2>&1 >/dev/null || { echo "Files differ"; files_differ=1; }
       if [ $files_differ -eq 1 ]; then
         exit_status=1
         diff_file=diff_${rs1}--${rs2}
         rm -f $diff_file
         sub RS1_$stamp RS2_$stamp $diff_file >/dev/null
         if [ $verbose -gt 0 ]; then
           echo "Difference in each variable between files" > ggstat_out
           echo "    A) $rs1" >> ggstat_out
           echo "    B) $rs2" >> ggstat_out
           echo " "           >> ggstat_out
           ggstat $diff_file  >> ggstat_out
           less ggstat_out
         fi
         if [ $keep_tmpd -eq 1 ]; then
           echo "Created difference file $diff_file"
         fi
       else
         echo "Files are identical"
       fi
       # Always remove the temporary restart files created by separmc
       rm -f RS1_$stamp RS2_$stamp
       ;;

  1,1) # Coupler restarts
       diff_file=diff_${rs1}--${rs2%.nc}.nc
       rm -f $diff_file
       ncdiff $rs1 $rs2 $diff_file

       avg_diff_file=avg_diff_${rs1}--${rs2%.nc}.nc
       rm -f $avg_diff_file
       ncwa $diff_file $avg_diff_file

       avg_rs1_file=avg_${rs1%.nc}.nc
       rm -f $avg_rs1_file
       ncwa $rs1 $avg_rs1_file

       avg_rs2_file=avg_${rs2%.nc}.nc
       rm -f $avg_rs2_file
       ncwa $rs2 $avg_rs2_file

       # Create a text file containing average values for variables in rs1 and rs2
       # Note that these need to be sorted lexographically to be consistent with
       # the average difference text files that are created below
       avg_rs1_var_list=avg_vars_${rs1%.nc}.txt
       ncdump $avg_rs1_file |
           sed '1,/^data:/d; /^ *$/d; /^ *lon_atm/d; /^ *lat_atm/d;
                /^ *lon_ocn/d; /^ *lat_ocn/d; /^ *[}] *$/d;' | sort > $avg_rs1_var_list
       echo "--- " >> $avg_rs1_var_list

       avg_rs2_var_list=avg_vars_${rs2%.nc}.txt
       ncdump $avg_rs2_file |
           sed '1,/^data:/d; /^ *$/d; /^ *lon_atm/d; /^ *lat_atm/d;
                /^ *lon_ocn/d; /^ *lat_ocn/d; /^ *[}] *$/d;' | sort > $avg_rs2_var_list
       echo "--- " >> $avg_rs2_var_list

       # Determine the fractional average difference relative to rs1
       avg_frac_diff_file=avg_frac_diff_${rs1}--${rs2%.nc}.nc
       rm -f $avg_frac_diff_file
       ncbo --op_typ=dvd $avg_diff_file $avg_rs1_file $avg_frac_diff_file
       avg_frac_diff_var_list=avg_frac_diff_vars_${rs1}--${rs2}.txt
       ncdump $avg_frac_diff_file |
           sed '1,/^data:/d; /^ *$/d; /^ *lon_atm/d; /^ *lat_atm/d;
                /^ *lon_ocn/d; /^ *lat_ocn/d; /^ *[}] *$/d;' > $avg_frac_diff_var_list
       sum_frac_avg=$(awk 'BEGIN {avg=0.0; n=0.0}
                      {if($NF==";"){i=NF-1;if($i=="NaN"){next};if($i<0.0){p=-$i}else{p=$i};avg+=p;n+=1}}
                      END {print avg/n}' $avg_frac_diff_var_list)
       echo "--- Avg over all vars is $sum_frac_avg" >> $avg_frac_diff_var_list

       # Determine the sum of abs(avg diff of each variable in the file)
       # If this is zero then the files are identical
       # If this is not zero then the files may be the same (due to round off, order of sum)
       # but are likely not
       avg_diff_var_list=avg_diff_vars_${rs1}--${rs2}.txt
       ncdump $avg_diff_file |
           sed '1,/^data:/d; /^ *$/d; /^ *lon_atm/d; /^ *lat_atm/d;
                /^ *lon_ocn/d; /^ *lat_ocn/d; /^ *[}] *$/d;' > $avg_diff_var_list
       sum_avg=$(awk 'BEGIN {avg=0.0; n=0.0}
                      {if($NF==";"){i=NF-1;if($i=="NaN"){next};if($i<0.0){p=-$i}else{p=$i};avg+=p;n+=1}}
                      END {print avg/n}' $avg_diff_var_list)
       echo "--- Avg over all vars is $sum_avg" >> $avg_diff_var_list

       # Combine these average difference text files
       avg_diff_list=avg_diff_list.txt
       echo "Average difference in each variable between files" > $avg_diff_list
       echo "    A) $rs1_real_name" >> $avg_diff_list
       echo "    B) $rs2_real_name" >> $avg_diff_list
       awk ' FNR==1 {++fid}
            (fid==1){a[FNR]=$0;next}  # read first file
            (fid==2){b[FNR]=$0;next}  # read second file
            (fid==3){c[FNR]=$0;next}  # read third file
                    {d[FNR]=$0}       # read fourth file
            END{printf("\n%-45s %-45s %-45s %s","  Average Value (A)","  Average Value (B)",
                       "  Average Difference (A-B)","  Fractional Difference (A-B)/A");
                for(i=0;i<=NR/4;i++){printf("%-45s %-45s %-45s %s\n",a[i],b[i],c[i],d[i])} }' \
            $avg_rs1_var_list $avg_rs2_var_list $avg_diff_var_list $avg_frac_diff_var_list >> $avg_diff_list

       # Clean up temporary files
       rm -f $avg_rs1_file $avg_rs2_file
       rm -f $avg_rs1_var_list $avg_rs2_var_list $avg_diff_var_list $avg_frac_diff_var_list

       if [ "$sum_avg" = "0" ]; then
         echo "Files are identical"
       else
         exit_status=1
         echo "Files differ"
         if [ $verbose -gt 0 ]; then
           cat $avg_diff_list
           echo " "
         fi
         if [ $keep_tmpd -eq 1 ]; then
           echo "Created summary text file in  $avg_diff_list"
           echo "                     difference file  $diff_file"
           echo "  average absolute   difference file  $avg_diff_file"
           echo "  average fractional difference file  $avg_frac_diff_file"
         fi
       fi
       ;;

  2,2) # NEMO restarts
       [ "$rs1_ocn" != "$rs2_ocn" ] && bail "NEMO ocn restart names are different"
       [ "$rs1_ice" != "$rs2_ice" ] && bail "NEMO ice restart names are different"
       [ "$rs1_trc" != "$rs2_trc" ] && bail "NEMO trc restart names are different"
       rs1_list="$rs1_ocn $rs1_ice $rs1_trc"
       [ -z "$rs1_list" ] && bail "No NEMO restart files were found for $rs1"
       for nemors in $rs1_list; do
         f1=$rs1_dir/$nemors
         f2=$rs2_dir/$nemors
         [ -s $f1 ] || bail "$f1 is missing or empty"
         [ -s $f2 ] || bail "$f2 is missing or empty"
         echo "=========== diff $f1 $f2"
         cdo -diffn $f1 $f2
       done
       ;;

    *) bail "$rs1 and $rs2 are different file types" ;;
esac

# Return to the invocation dir and clean up the tmp dir
cd $CURR_DIR
if [ $keep_tmpd -eq 1 ]; then
  echo "Files are in a temporary directory at $TMP_DIR"
else
  rm -fr $TMP_DIR
fi

exit $exit_status
