#!/usr/bin/env perl
###############################################################################
# Create lists of file names found in DATAPATH or in a user supplied directory.
#
# File lists will be organized according to file type (e.g. time series, rtd,
# history, restart, diagnostic or pooled files). The user may control which
# files appear in these lists by supplying suffix lists for each file type.
# However, there are internal defaults for all suffix lists.
# In addition to these suffix lists the user must supply a runid and a date
# range, which will be used to extract the desired file names.
#
# All file lists are written as text files with one file per line and an
# additional file is created containing variables that identify the names of
# each of these output file lists.
#
# Larry Solheim Jan 2014
###############################################################################

  # ---Start_submit_ignore_code----
  use Cwd qw(abs_path);

  my $usage = "  Usage: data_file_lists runid [data_dir] [options]";
  $usage .= "\n    data_dir ...look in data_dir for all files (default DATAPATH)";
  $usage .= "\nOptions:  (LIST is a white space separated list of strings)";
  $usage .= "\n    --start=year:mon          ...the start of the date range";
  $usage .= "\n    --stop=year:mon           ...the end of the date range";
  $usage .= "\n    --range=YYYYmMM_YYYYmMM   ...the date range";
  $usage .= "\n      start/stop or range indicate the range of dates for which";
  $usage .= "\n      files will be selected.";
  $usage .= "\n      Either range or both start and stop are required.";
  $usage .= "\n    --all-pooled              ...list all pooled files found";
  $usage .= "\n      By default only multi year pooled files will be listed";
  $usage .= "\n    ts_suffix_list=LIST   ...suffixes for time series files";
  $usage .= "\n    rs_suffix_list=LIST   ...suffixes for restart files";
  $usage .= "\n    hist_suffix_list=LIST ...suffixes for history files";
  $usage .= "\n    diag_suffix_list=LIST ...suffixes for diagnostic files";
  $usage .= "\n    pool_suffix_list=LIST ...suffixes for pooled files";
  $usage .= "\n      These suffix lists all have internal defaults (are not mandatory).";
  $usage .= "\n      Files that have any suffix in the associated suffix list are selected.";
  $usage .= "\n\n.Information about all file lists created is written to a file named";
  $usage .= "\n    data_file_lists_output";

  # If there were no command line args then print usage info and exit
  unless ( scalar(@ARGV)>0 ) {
    warn "$usage\n";
    exit;
  }

  # Determine the current DATAPATH
  my $DATAPATH = "";
  if ( $ENV{DATAPATH} ) {
    # Define DATAPATH if it is defined in the current environment
    if ( -d "$ENV{DATAPATH}" ) {
      $DATAPATH = "$ENV{DATAPATH}";
    } else {
      die "DATAPATH=$ENV{DATAPATH} is not a directory\n";
    }
    # Follow any links
    $DATAPATH = readlink $DATAPATH if -l $DATAPATH;
    # Ensure this is an absolute pathname
    $DATAPATH = abs_path( $DATAPATH );

  } else {
    warn "DATAPATH is not defined in this environment.\n";
  }

  my $runid = '';
  my $data_dir = '';
  $verbose = 0;
  use vars qw( $data_dir $verbose );

  my ( @NonOpt, %usrvar );
  my $start = '';
  my $stop = '';
  my $ym_range = '';
  my $shift_rs_date_range = 1;
  my $multi_year_pooled_only = 1;

  # The set of command line parameters that may be defined by the user
  # These are all the keys that are allowed in %usrvar
  my @cmdl_params = ( "ts_suffix_list", "rs_suffix_list", "hist_suffix_list",
                      "diag_suffix_list", "pool_suffix_list" );

  use Getopt::Long;
  $Getopt::Long::order = $PERMUTE;
  &GetOptions("verbose"          => sub {$verbose++},
              "start=s"          => \$start,
              "stop=s"           => \$stop,
              "range=s"          => \$ym_range,
              "rs_shift!"        => \$shift_rs_date_range,
              "all-pooled!"      => sub {$multi_year_pooled_only=0},
              "<>"               => sub {push @NonOpt, $_[0]})
      or die "$usage\n";

  # Process non option command line args
  foreach (@NonOpt) {
    next unless $_;
    if (/=/) {
      # This is a variable assignment of the form "var=val"
      # Strip quotes, if any and assign $var and $val
      s/^\s*"(.*)"\s*$/$1/;
      s/^\s*'(.*)'\s*$/$1/;
      my ($var,$val) = /^\s*(.*?)=(.*)/;

      # Strip leading or trailing whitespace from the value
      $val =~ s/^\s*(.*?)\s*$/$1/;

      # Strip quotes from the value, if any
      $val =~ s/^\s*"(.*)"\s*$/$1/;
      $val =~ s/^\s*'(.*)'\s*$/$1/;

      unless ( $val eq "0" ) {
        # Replace NULL values with a single space
        # This will cause an empty list element to be pushed onto the list below
        $val = " " unless $val;
      }

      # Check for valid parameter names
      my $found = 0;
      foreach my $parm (@cmdl_params) {
        if ( $parm eq $var ) {
          $found = 1;
          last;
        }
      }
      unless ($found) {
        die "Invalid command line parameter --> $var <--\n$usage\n";
      }

      # Add variable defs found on the command line to the usrvar hash

      # Multiple variable assignments for the same variable will be added to a
      # list of values for that variable.
      # Each individual assignment will be split on whitespace and added
      # as a separate entry in the list of values.
      $val =~ s/^\s*(.*?)\s*$/$1/;
      push @{$usrvar{$var}}, split(/\s+/,$val);

      next;
    }

    # Any remaining command line args should be either runid or data_dir
    if ( $runid and $data_dir ) {
      # Both runid and data_dir have been seen
      die "Invalid comand line argument --> $_ <--\n";
    } elsif ( $runid ) {
      # runid is defined so this must be data_dir
      $data_dir = $_;
    } else {
      $runid = $_;
    }

  }

  # If a directory name was not supplied on the command line then use DATAPATH
  $data_dir = $DATAPATH unless $data_dir;

  die "runid must be defined.\n$usage\n" unless $runid;
  die "data_dir must be defined.\n$usage\n" unless $data_dir;
  die "$data_dir is not a directory.\n$usage\n" unless -d $data_dir;

  # Define default lists of suffixes for various file types
  # An empty list means use all suffixes found on disk
  my @ts_suffix_list = ();
  my @hist_suffix_list = ( "gs", "ss", "gz", "td" );
  my @rs_suffix_list = ( "rs", "cs", "os", "ts", "ab", "ob", "an" );
  my @diag_suffix_list =
      ( "gp", "xp", "dd", "dp", "dsd", "dcosp", "gp6", "xp6", "cc", "ie" );
  my @pool_suffix_list = ();

  # Redefine a suffix list if it was provided by the user
  if ( $usrvar{ts_suffix_list} ) {
    # If a command line definition is present use it
    @ts_suffix_list = @{$usrvar{ts_suffix_list}};
  } elsif ( $ENV{ts_suffix_list} ) {
    # If this is defined in the environment then use that value
    # Note that an empty list will result when the env variable
    # contains nothing but whitespace
    my ($envdef) = $ENV{ts_suffix_list} =~ /^\s*(.*?)\s*$/;
    @ts_suffix_list = split(/\s+/, $envdef);
  }

  if ( $usrvar{hist_suffix_list} ) {
    # If a command line definition is present use it
    @hist_suffix_list = @{$usrvar{hist_suffix_list}};
  } elsif ( $ENV{hist_suffix_list} ) {
    # If this is defined in the environment then use that value
    # Note that an empty list will result when the env variable
    # contains nothing but whitespace
    my ($envdef) = $ENV{hist_suffix_list} =~ /^\s*(.*?)\s*$/;
    @hist_suffix_list = split(/\s+/, $envdef);
  }

  if ( $usrvar{rs_suffix_list} ) {
    # If a command line definition is present use it
    @rs_suffix_list = @{$usrvar{rs_suffix_list}};
  } elsif ( $ENV{rs_suffix_list} ) {
    # If this is defined in the environment then use that value
    # Note that an empty list will result when the env variable
    # contains nothing but whitespace
    my ($envdef) = $ENV{rs_suffix_list} =~ /^\s*(.*?)\s*$/;
    @rs_suffix_list = split(/\s+/, $envdef);
  }

  if ( $usrvar{diag_suffix_list} ) {
    # If a command line definition is present use it
    @diag_suffix_list = @{$usrvar{diag_suffix_list}};
  } elsif ( $ENV{diag_suffix_list} ) {
    # If this is defined in the environment then use that value
    # Note that an empty list will result when the env variable
    # contains nothing but whitespace
    my ($envdef) = $ENV{diag_suffix_list} =~ /^\s*(.*?)\s*$/;
    @diag_suffix_list = split(/\s+/, $envdef);
  }

  if ( $usrvar{pool_suffix_list} ) {
    # If a command line definition is present use it
    @pool_suffix_list = @{$usrvar{pool_suffix_list}};
  } elsif ( $ENV{pool_suffix_list} ) {
    # If this is defined in the environment then use that value
    # Note that an empty list will result when the env variable
    # contains nothing but whitespace
    my ($envdef) = $ENV{pool_suffix_list} =~ /^\s*(.*?)\s*$/;
    @pool_suffix_list = split(/\s+/, $envdef);
  }

  if ( $ym_range ) {
    # The user has supplied ym_range
    # ym_range is assumed to be of the form YYYYmMM_YYYYmMM
    # where YYYY and MM are integer year and month
    ($y1,$m1,$y2,$m2) = $ym_range =~ /^\s*(\d+)m(\d+)_(\d+)m(\d+)\s*$/;

    # Note that this check will disallow any zero values
    die "Invalid ym_range = $ym_range.\n" unless ($y1 and $m1 and $y2 and $m2);
    if ( $start and $stop ) {
      warn "Both range and start/stop are defined. Using range = $ym_range\n";
    }
  } elsif ( $start and $stop ) {
    # The user has supplied a start and stop date
    ($y1,$m1) = split(/\s*:\s*/, $start);
    # The month will default to Jan for start
    $m1 = 1 unless $m1;
    # Note that this check will disallow any zero values
    die "Invalid start option $start\n" unless ($y1 and $m1);
    ($y2,$m2) = split(/\s*:\s*/, $stop);
    # The month will default to Dec for stop
    $m2 = 12 unless $m2;
    # Note that this check will disallow any zero values
    die "Invalid stop option $stop\n" unless ($y2 and $m2);
  } else {
    die "Either range or both start and stop must be defined.\n$usage\n";
  }

  # Check on date range values
  die "Invalid start year = $y1\n" if $y1 < 0;
  die "Invalid start month = $m1\n" unless ($m1>0 and $m1<13);
  die "Invalid stop year = $y2\n" if $y2 < 0;
  die "Invalid stop month = $m2\n" unless ($m2>0 and $m2<13);
  die "Invalid start_year=$y1 > stop_year=$y2\n" if $y1 > $y2;
  die "Invalid start_month=$m1 > stop_month=$m2\n" if ($y1==$y2 and $m1>$m2);

  # Get a sorted list of all files in the data directory
  opendir(DIR, $data_dir) or die "$! Stopped";
    # Ignore all ".*" entries
    @flist = sort grep !/^\./, readdir DIR;
  closedir(DIR);

  # Only file names containing a numeric suffix will be extracted below

  # mid will form part of the regex used to extract certain files below and is
  # used to match the middle part of files names (ie runid_YYYY_mMM)
  my $mid = "${runid}_" . '\d+_m\d\d';

  # Extract all time series files, associated with this runid, found in this dir
  if ( scalar(@ts_suffix_list) ) {
    # Only allow suffixes found in this list
    $sfx = join("|",@ts_suffix_list);
    @time_series_list_all = grep /^s._${runid}_\d+_\d+_($sfx)_.*(\.\d+)$/, @flist;
  } else {
    # Add all time series files found on disk
    @time_series_list_all = grep /^s._${runid}_\d+_\d+_.*(\.\d+)$/, @flist;
    # This will include rtd files, remove all rtd files from this list
    @time_series_list_all = grep !/_rtd\d*(\.nc)*(\.\d+)$/, @time_series_list_all;
  }

  # Extract all run time diagnostic files, associated with this runid, found in this dir
  @rtd_list_all = grep /^s._${runid}_\d+_\d+_rtd\d*(\.nc)*(\.\d+)$/, @flist;

  # Extract restart files for all years, associated with this runid, found in this dir

  # Define a list of months for which restart files will be listed
  if ( $ENV{rs_months_list} ) {
    my ($envdef) = $ENV{rs_months_list} =~ /^\s*(.*?)\s*$/;
    die "ENV{rs_months_list} is defined but only contains blanks" unless $envdef;
    @rs_months_list = split(/\s+/, $envdef);
  } else {
    @rs_months_list = ("$m2");
  }
  die "Empty rs_months_list" unless scalar(@rs_months_list);
  foreach (@rs_months_list) {
    unless (/^\d+/) {
      warn "rs_months_list = @rs_months_list\n";
      die "Month --> $_ <-- is not an integer in rs_months_list"
    }
    if ($_ < 1 or $_ > 12) {
      warn "rs_months_list = @rs_months_list\n";
      die "Month --> $_ <-- in rs_months_list is out of range";
    }
    $_ = sprintf("%2.2d", $_);
  }
  @rs_months_list = sort( @rs_months_list );

  # Extract restart files for each suffix and each month in rs_months_list
  if ( scalar(@rs_suffix_list) and scalar(@rs_months_list) ) {
    # Only allow months found in rs_months_list
    my $rsmon = join("|",@rs_months_list);
    # Only allow suffixes found in rs_suffix_list
    my $sfx = join("|",@rs_suffix_list);
    @restart_list_all = grep /^m._${runid}_\d+_m($rsmon)_($sfx)(\.\d+)$/, @flist;
  } else {
    # Add all restart files found on disk (note this will include history files)
    @restart_list_all = grep /^m._${runid}_\d+_m\d\d_.*(\.\d+)$/, @flist;
  }

  # Extract all history files, associated with this runid, found in this dir
  if ( scalar(@hist_suffix_list) ) {
    # Only allow suffixes found in this list
    $sfx = join("|",@hist_suffix_list);
    @hist_list_all = grep /^m._${mid}.*_($sfx)(\.\d+)$/, @flist;
  } else {
    # Add all history files found on disk (note this will include restart files)
    @hist_list_all = grep /^m._${mid}.*(\.\d+)$/, @flist; 
  }

  # Extract all diagnostic files, associated with this runid, found in this dir
  if ( scalar(@diag_suffix_list) ) {
    # Only allow suffixes found in this list
    $sfx = join("|",@diag_suffix_list);
    @diag_list_all = grep /^d._${mid}.*_($sfx)(\.\d+)$/, @flist;
  } else {
    # Add all diagnostic files found on disk
    @diag_list_all = grep /^d._${mid}.*(\.\d+)$/, @flist;
  }

  # Extract all pooled files found in this dir
  if ( scalar(@pool_suffix_list) ) {
    # Only allow suffixes found in this list
    $sfx = join("|",@pool_suffix_list);
    @pool_list_all = grep /^p._${runid}_.*_($sfx)(\.\d+)$/, @flist;
  } else {
    # Add all pooled files found on disk
    @pool_list_all = grep /^p._${runid}_.*(\.\d+)$/, @flist;
  }

  # Extract a subset of files from each list for the given date range
  my @time_series_list;
  my @rtd_list;
  my @restart_list;
  my @hist_list;
  my @diag_list;
  my @pool_list;

  for (my $year=$y1; $year<=$y2; $year++) {
    $year = sprintf("%3.3d",$year);
    # rtd files names are of the form .._runid_YYYY_YYYY_rtd\d*(\.\d+)
    push @rtd_list, grep /^.._${runid}_\d+_${year}.*(\.\d+)$/, @rtd_list_all;

    # Time series files are of the form .._runid_YYYY_YYYY_.*
    push @time_series_list,
        grep /^.._${runid}_\d+_${year}.*(\.\d+)$/, @time_series_list_all;

    if ( $multi_year_pooled_only ) {
      # List only mlti year pooled files ( this is the default)
      # Multi year pooled files are of the form .._runid_YYYYmMM_YYYYmMM_.*
      push @pool_list,
          grep /^.._${runid}_${year}m\d\d_\d+m\d\d_.*(\.\d+)$/, @pool_list_all;
    } else {
      # List all pooled files of the form .._runid_YYYY_.*
      push @pool_list,
          grep /^.._${runid}_${year}_.*(\.\d+)$/, @pool_list_all;
      # Include multi year pooled files of the form .._runid_YYYYmMM_YYYYmMM_.*
      push @pool_list,
          grep /^.._${runid}_${year}m\d\d_\d+m\d\d_.*(\.\d+)$/, @pool_list_all;
    }

    my $mon1 = 1;
    $mon1 = $m1 if $year == $y1;
    my $mon2 = 12;
    $mon2 = $m2 if $year == $y2;
    for (my $mon=$mon1; $mon<=$mon2; $mon++) {
      $mon = sprintf("%2.2d",$mon);
      # The remaining files are of the form .*_runid_YYYY_mMM_.*
      push @restart_list,
          grep /^.._${runid}_${year}_m${mon}_.*(\.\d+)$/, @restart_list_all;
      push @hist_list,
          grep /^.._${runid}_${year}_m${mon}_.*(\.\d+)$/, @hist_list_all;
      push @diag_list,
          grep /^.._${runid}_${year}_m${mon}_.*(\.\d+)$/, @diag_list_all;
    }
  }

  # Shift the date range back by one month and extract another set of lists
  # Do not do this for rtd or time series files since these both contain time
  # series data (ie not monthly data like other files)
  # Do not do this for pooled files since they contain average data
  my @restart_list_shifted;
  my @hist_list_shifted;
  my @diag_list_shifted;
  if ( $m1 == 1 ) {
    $rsm1 = 12;
    $rsy1 = $y1 - 1;
  } else {
    $rsm1 = $m1 - 1;
    $rsy1 = $y1;
  }
  if ( $m2 == 1 ) {
    $rsm2 = 12;
    $rsy2 = $y2 - 1;
  } else {
    $rsm2 = $m2 - 1;
    $rsy2 = $y2;
  }
  for (my $year=$rsy1; $year<=$rsy2; $year++) {
    $year = sprintf("%3.3d",$year);
    my $mon1 = 1;
    $mon1 = $rsm1 if $year == $rsy1;
    my $mon2 = 12;
    $mon2 = $rsm2 if $year == $rsy2;
    for (my $mon=$mon1; $mon<=$mon2; $mon++) {
      $mon = sprintf("%2.2d",$mon);
      # These files are of the form .*_runid_YYYY_mMM_.*
      push @restart_list_shifted,
          grep /^.._${runid}_${year}_m${mon}_.*(\.\d+)$/, @restart_list_all;
      push @hist_list_shifted,
          grep /^.._${runid}_${year}_m${mon}_.*(\.\d+)$/, @hist_list_all;
      push @diag_list_shifted,
          grep /^.._${runid}_${year}_m${mon}_.*(\.\d+)$/, @diag_list_all;
    }
  }

  # Open a file that will contain information about output file names
  my $lists_file = "data_file_lists_output";
  open(LISTS, ">$lists_file") or die "Unable to open $lists_file   $!\n";

  # Create text files containing time series file lists
  $pfx = def_pfx("sx", @time_series_list);
  $out_list      = "${pfx}_time_series_files";
  $out_list_full = "${pfx}_time_series_files_full_path";
  proc_sublist($out_list, $out_list_full, @time_series_list);
  if ( scalar(@time_series_list) ) {
    # Add info about these output file names to the LISTS file
    print LISTS "ts_files=$out_list\n";
    print LISTS "ts_files_full=$out_list_full\n";
    print LISTS "ts_files_pfx=$pfx\n";
  }

  # Create text files containing rtd file lists
  $pfx = def_pfx("sx", @rtd_list);
  $out_list      = "${pfx}_rtd_files";
  $out_list_full = "${pfx}_rtd_files_full_path";
  proc_sublist($out_list, $out_list_full, @rtd_list);
  if ( scalar(@rtd_list) ) {
    # Add info about these output file names to the LISTS file
    print LISTS "rtd_files=$out_list\n";
    print LISTS "rtd_files_full=$out_list_full\n";
    print LISTS "rtd_files_pfx=$pfx\n";
  }

  # Create text files containing restart file lists
  $pfx = def_pfx("mx", @restart_list);
  $out_list      = "${pfx}_restart_files";
  $out_list_full = "${pfx}_restart_files_full_path";
  proc_sublist($out_list, $out_list_full, @restart_list);
  if ( scalar(@restart_list) ) {
    # Add info about these output file names to the LISTS file
    print LISTS "rs_files=$out_list\n";
    print LISTS "rs_files_full=$out_list_full\n";
    print LISTS "rs_files_pfx=$pfx\n";
  }
  $pfx = def_pfx("mx", @restart_list_shifted);
  $out_list      = "${pfx}_restart_files_shifted";
  $out_list_full = "${pfx}_restart_files_shifted_full_path";
  proc_sublist($out_list, $out_list_full, @restart_list_shifted);
  if ( scalar(@restart_list_shifted) ) {
    # Add info about these output file names to the LISTS file
    print LISTS "rs_files_shifted=$out_list\n";
    print LISTS "rs_files_shifted_full=$out_list_full\n";
    print LISTS "rs_files_shifted_pfx=$pfx\n";
  }

  # Create text files containing history file lists
  $pfx = def_pfx("mx", @hist_list);
  $out_list      = "${pfx}_hist_files";
  $out_list_full = "${pfx}_hist_files_full_path";
  proc_sublist($out_list, $out_list_full, @hist_list);
  if ( scalar(@hist_list) ) {
    # Add info about these output file names to the LISTS file
    print LISTS "hist_files=$out_list\n";
    print LISTS "hist_files_full=$out_list_full\n";
    print LISTS "hist_files_pfx=$pfx\n";
  }
  $pfx = def_pfx("mx", @hist_list_shifted);
  $out_list      = "${pfx}_hist_files_shifted";
  $out_list_full = "${pfx}_hist_files_shifted_full_path";
  proc_sublist($out_list, $out_list_full, @hist_list_shifted);
  if ( scalar(@hist_list_shifted) ) {
    # Add info about these output file names to the LISTS file
    print LISTS "hist_files_shifted=$out_list\n";
    print LISTS "hist_files_shifted_full=$out_list_full\n";
    print LISTS "hist_files_shifted_pfx=$pfx\n";
  }

  # Create text files containing diagnostic file lists
  $pfx = def_pfx("dx", @diag_list);
  $out_list      = "${pfx}_diag_files";
  $out_list_full = "${pfx}_diag_files_full_path";
  proc_sublist($out_list, $out_list_full, @diag_list);
  if ( scalar(@diag_list) ) {
    # Add info about these output file names to the LISTS file
    print LISTS "diag_files=$out_list\n";
    print LISTS "diag_files_full=$out_list_full\n";
    print LISTS "diag_files_pfx=$pfx\n";
  }
  $pfx = def_pfx("dx", @diag_list_shifted);
  $out_list      = "${pfx}_diag_files_shifted";
  $out_list_full = "${pfx}_diag_files_shifted_full_path";
  proc_sublist($out_list, $out_list_full, @diag_list_shifted);
  if ( scalar(@diag_list_shifted) ) {
    # Add info about these output file names to the LISTS file
    print LISTS "diag_files_shifted=$out_list\n";
    print LISTS "diag_files_shifted_full=$out_list_full\n";
    print LISTS "diag_files_shifted_pfx=$pfx\n";
  }

  # Create text files containing pooled file lists
  $pfx = def_pfx("px", @pool_list);
  $out_list      = "${pfx}_pool_files";
  $out_list_full = "${pfx}_pool_files_full_path";
  proc_sublist($out_list, $out_list_full, @pool_list);
  if ( scalar(@pool_list) ) {
    # Add info about these output file names to the LISTS file
    print LISTS "pool_files=$out_list\n";
    print LISTS "pool_files_full=$out_list_full\n";
    print LISTS "pool_files_pfx=$pfx\n";
  }

  close(LISTS);

  sub def_pfx {
    # Given a list of file names extract the most common file name prefix
    # The prefix is defined as whatever appears before the first underscore
    # but is set to a user supplied default value if the file name contains
    # no underscores
    use strict;

    my $default_pfx = shift;
    die "Missing default prefix. Stopped" unless $default_pfx;

    unless ( scalar(@_) ) {
      # If the file list is empty then return the user supplied default value
      return $default_pfx;
    }

    my %pfx;
    foreach my $fname ( @_ ) {
      # Extract the prefix up to the first underscore
      my ($curr_pfx) = $fname =~ /^(.*?)_/;
      next unless $curr_pfx;
      next if $curr_pfx eq $fname;
      $pfx{$curr_pfx} = 0 unless $pfx{$curr_pfx};
      $pfx{$curr_pfx}++;
    }

    my $new_pfx = "";
    my $count = 0;
    foreach ( keys %pfx ) {
      if ($pfx{$_} > $count) {
        $new_pfx = $_;
        $count = $pfx{$_};
      }
    }

    return $new_pfx ? $new_pfx : $default_pfx;
  }

  sub proc_sublist {
    # Create a disk file containing a list of files, one file name per line
    # The name of the output file list as well as the file names that go into
    # the list are passed in via the command line
    # Duplicate files (equivalent except for a numeric suffix) are removed
    # and only the file with the highest numeric suffix is retained
    use strict;
    my $list_name = shift;
    die "Missing list name. Stopped" unless $list_name;
    my $full_path_list_name = shift;
    die "Missing list name. Stopped" unless $full_path_list_name;

    unless ( scalar(@_) ) {
      # If the file list is empty then simply remove list_name
      unlink $list_name, $full_path_list_name;
      return 1;
    }

    my %hlist;
    foreach my $fname ( @_ ) {
      # Use the file name without any numeric suffix as a hash key to filter
      # out multiple numeric suffixes on the same file name
      # Since the original list was sorted, this will cause retention of the
      # the file with the highest numeric suffix and throw away the others
      my ($nosfx) = $fname =~ /^(.*?)(?:\.\d+)?$/;
      $hlist{$nosfx} = $fname;
    }

    # Create a list with just file base names and any numeric suffix removed
    print "$list_name\n" if $verbose;
    open(SUBFILES, ">$list_name") or die "$! Stopped";
    foreach my $fn ( sort keys %hlist ) {
      print SUBFILES "$fn\n";
      print "\t$fn\n" if $verbose;
    }
    close(SUBFILES);

    # Create a list of full path names, including any numeric suffix
    open(SUBFILES, ">$full_path_list_name") or die "$! Stopped";
    foreach my $fn ( sort keys %hlist ) {
      print SUBFILES "$data_dir/$hlist{$fn}\n";
    }
    close(SUBFILES);

    return 1;
  }

