#!/usr/bin/env perl
########################################################################
# Get a list of files found in the cfs archive database that match a
# pattern (or patterns) that are supplied on the command line.
# These patterns may contain shell wild cards (which must be quoted
# against shell expansion).
#
# Larry Solheim Apr 2010
#
# $Id: lsarc 655 2011-08-04 23:04:36Z acrnrls $
########################################################################

require 5;
use Cwd ();
use File::Basename ();
use File::Find ();

# Declare global variables
use vars qw(%CFSDB %DUPLICATE %env @NonOpt &init_cfsdb $verbose
           $dump_arcdat $masterdir $cfsuser $show_duplicates $all_master_dir
           $CFSDATA $short_term_dir $long_term_dir $others_dir $location
           %individual @allpat $match_all @indv_files $ROUTE_SSH);

# Used by find below
use vars qw/*find_name *find_dir *find_prune $find_cwd/;
*find_name   = *File::Find::name;
*find_dir    = *File::Find::dir;
*find_prune  = *File::Find::prune;
$find_cwd    = Cwd::cwd();
sub doexec ($@);

# Name of this script, used in messages.
# Use File::Basename instead of spawning a shell for `basename`.
$Runame = File::Basename::basename($0);

use Sys::Hostname;
my $host = hostname();
# Reduce a fully qualified host name to its short form
$host =~ s/^(.*?)\..*$/$1/;
# Define an alias for the current host and determine kernel type
# Optionally define a prefix for all ssh commands
# to reroute through a head node when required
#
# NOTE: $ROUTE_SSH is declared as a package global in the "use vars" list
# above and is read by subroutines below (e.g. lslR_cfs_dir). Assign the
# global directly; the previous "my $ROUTE_SSH" created a file scoped
# lexical that shadowed the declared global.
my $mach_name = "";
my $mach_type = "";
$ROUTE_SSH = "";
if ($host =~ /^lx/) {
  # Assume this is a machine in Victoria
  $mach_name = $host;
  $mach_type = "linux";
} elsif ( $host =~ /^joule/ ) {
  $mach_name = "joule";
  $mach_type = "linux";
} elsif ( $host =~ /^ib3/ ) {
  # ib3* nodes are reached through the pollux head node
  $mach_name = "pollux";
  $mach_type = "linux";
  $ROUTE_SSH = "ssh pollux";
} elsif ( $host =~ /^c1/ ) {
  $mach_name = "spica";
  $mach_type = "aix";
  $ROUTE_SSH = "ssh spica";
} elsif ( $host =~ /^c2/ ) {
  $mach_name = "hadar";
  $mach_type = "aix";
  $ROUTE_SSH = "ssh hadar";
} else {
  die "** EE ** Unrecognized host $host\n";
}

# Verbosity level; higher values produce more diagnostic output.
# Set to -1 below when --quiet is requested.
$verbose = 0;
$quiet = 0;

# $prex = 1 means assume that each pattern is a perl regular expressions,
# otherwise each pattern will/may contain shell wildcards
$prex = 0;

# Read the cfs database then dump it to stdout
# This is used internally for reading the database on a remote machine
# but may be invoked by the user.
$dump_arcdat = 0;

# Read entries from the "official" directories in the cfs database
# By default the user directory is read
$masterdir = 0;

# Read entries in the "others" dir but only when $masterdir is true.
$others_dir = 0;

# Read all entries in the masterdir, including the others dir and all runid dirs
$all_master_dir = 0;

# Look for individual files that do not belong to an arcfile. These files are
# treated differently from those in an arcfile and so a different location below
# ~acrnsrc/arc_dir (the archive database) must be searched to find them.
#   check        - search for individual files
#   check_active - also search the "active" subdir of SUM_DIR
#   cutoff_year/cutoff_day - ignore files older than this date (see --START)
#   read_arc     - passed through to process_individual
$individual{check}        = 0;
$individual{check_active} = 0;
$individual{cutoff_year}  = undef;
$individual{cutoff_day}   = undef;
$individual{read_arc}     = 0;

# with_individual is a boolean flag to determine whether or not the CFSDB hash
# will be reinitialized to search for individual files after the initial search
$with_individual = 0;

# short_term_dir and long_term_dir are used when accessing a users archive.
# They flag inclusion of these dirs in the search of the CFS database
$short_term_dir = 1;
$long_term_dir  = 1;

# When CFSDATA is true then *only* the file ~/info/CFSDATA will be searched
# to create the cfs data base.
# If this file is missing or empty the job will abort.
$CFSDATA = 0;

# cfsuser is only used when $masterdir is false. It identifies the owner
# of the files on cfs that will be searched for patterns.
chomp($cfsuser = `whoami`);

# Read patterns from environment variables with names of the form file1, file2,...
$read_env = 0;

# If $count_file is defined and non null then create a file named $count_file
# that contains the number of hits for all patterns matches
# This is intended for use in batch strings
$count_file = "";

# $match_all is a boolean flag used to determine if patterns supplied on the
# command line will be "and"ed together so that each hit will match all patterns.
# The default is to "or" patterns so that each hit will match any pattern.
$match_all = 0;

# Show duplicate entries in the cfs database when they are found
$show_duplicates = 1;

# single_column_output is a logical flag to indicate that file names found should
# be sent to stdout alone, one file name per line. This is useful when creating a
# list of files to load, particularly for use with qload
$single_column_output = 0;

# Define a usage function
# Usage/help routine. Prints an optional error message followed by the
# usage text, then exits: status 1 when a message was supplied (error
# path), status 0 otherwise (plain --help).
$Usage = sub {
  my ($msg) = @_;
  print "${Runame}: $msg\n" if $msg;
  print <<EOR;
  Usage: $Runame [options] pattern [pattern ...]
Purpose: List files on CFS given a pattern or patterns
Options:
  --masterdir=runid ...look in "offical" dir on CFS for file names associated with runid
  runid=abc         ...look in "offical" dir on CFS for file names associated with runid
                       A runid=abc definition on the command line implies masterdir=on
  --others          ...look in "others" dir (this also implies masterdir=on)
  --user=acrnxxx    ...look in the acrnxxx user account for matching file names
  --short           ...look in user short term dir only (ignored if masterdir=on)
  --long            ...look in user long term dir only  (ignored if masterdir=on)
  --individual      ...look for individual files that do not belong to an arcfile
  --START=year:day  ...ignore files that are older that the specified date.
                       This is only valid in conjunction with the --individual option.
                       If year or day are positive integers then they will be used as
                       the actual year or day (e.g. --START=2010:1 means ignore files
                       that were written before Jan 1 of 2010).
                       If year or day are negative integers then they are relative to
                       the current year and day (e.g. --START=-1 means ignore files older
                       than 1 year).
                       The default year and day correspond to 10 days ago.
  --with_individual ...look for individual files in addition to the primary search.
                       A search will be done according to the user supplied criteria
                       but after that search is complete, the same search will be done
                       on individual files that do not belong to an arcfile. The --START
                       command line option may be used to determine the maximum age
                       of files found. This can add a significant amount of time to
                       the search and so should be used with caution.
  --all_match       ...match all patterns supplied on the command line (default, match any)
  --1               ...output only file names, one file name per line (this is the number 1)
  --verbose         ...increase verbosity (additive)
  --dump_arcdat     ...initialize cfs info and dump it to stdout, then exit
EOR
  exit($msg ? 1 : 0);
};
$show_usage = 0;

# Determine the location from which this script was invoked
my $domain = (split(/\s+/, `grep search /etc/resolv.conf`))[1];
if ($domain) {
  $domain =~ s/^\s*int\.//i if $domain =~ /^\s*int\./;
  $location = (split /\./, $domain)[0];
} else {
  # chomp($host = `hostname`);
  if ($host) {
    $location = "Unknown";
    if ($host =~ /^joule/)  {$location = "cmc"};
    if ($host =~ /^pollux/) {$location = "cmc"};
    if ($host =~ /^erg/)    {$location = "cmc"};
    if ($host =~ /^alef/)   {$location = "cmc"};
    if ($host =~ /^ib/)     {$location = "cmc"};
    if ($host =~ /^c\d/)    {$location = "cmc"};
  } else {
    $location = "Unknown";
  }
}

# Process command line arguments
use Getopt::Long;
# Old-style Getopt::Long configuration: case sensitive option matching,
# permuted ordering (options and non-options may be intermixed).
# NOTE(review): $PERMUTE is assumed to be exported by Getopt::Long via its
# legacy configuration interface -- confirm with the installed version
# (the modern equivalent is Getopt::Long::Configure("permute")).
$Getopt::Long::ignorecase = 0;
$Getopt::Long::order = $PERMUTE;
&GetOptions("verbose"          => sub {$verbose++},
            "help!"            => \$show_usage,
            "prex!"            => \$prex,
            "1!"               => \$single_column_output,
            "all_match!"       => \$match_all,
            "quiet!"           => \$quiet,
            "dump_arcdat!"     => \$dump_arcdat,
            # --masterdir=X and --runid=X are equivalent: both imply
            # masterdir and queue a runid="..." definition that is parsed
            # by the non-option pass below
            "masterdir=s"      => sub {$masterdir=1; push @NonOpt,'runid="'.$_[1].'"'},
            "runid=s"          => sub {$masterdir=1; push @NonOpt,'runid="'.$_[1].'"'},
            "global_master"    => sub {$all_master_dir=1; $masterdir=1; $others_dir=1},
            "env!"             => \$read_env,
            "CFSDATA!"         => \$CFSDATA,
            "short"            => sub {$short_term_dir=1; $long_term_dir=0},
            "long"             => sub {$short_term_dir=0; $long_term_dir=1},
            "others"           => sub {$others_dir=1; $masterdir=1},
            # --individual restricts the search to individual (non arcfile)
            # files only, turning off every other search location
            "individual"       => sub {$individual{check}=1; $individual{check_active}=1;
                                       $masterdir=0; $all_master_dir=0;
                                       $CFSDATA=0; $others_dir=0;
                                       $short_term_dir=0; $long_term_dir=0},
            # --START=year:day sets the age cutoff for individual files
            "START=s"          => sub {$_[1] =~ s/\s+//g; my ($y,$d) = split(/:/,$_[1]);
                                       $individual{cutoff_year}=$y if $y;
                                       $individual{cutoff_day}=$d if $d},
            "with_individual!" => \$with_individual,
            "count_file=s"     => \$count_file,
            "user=s"           =>
               sub {
                $cfsuser=$_[1];
                die "The --user= command line option requires a value\n" unless $cfsuser;
               },
            "duplicates!"      => \$show_duplicates,
            # Everything that is not an option is collected for the pass below
            "<>"               => sub {push @NonOpt,$_[0]})
    or &$Usage("Error on command line.");

&$Usage() if $show_usage;

# Process non option command line args.
# Each remaining arg is either a "var=val" assignment or a file name pattern.
@pattern = ();
foreach (@NonOpt) {
  next unless $_;
  if (/=/) {
    # This is a variable assignment
    my ($var,$val) = /^\s*(.*?)=(.*)/;
    # Strip quotes from the value, if any
    $val =~ s/^\s*"(.*)"\s*$/$1/;
    $val =~ s/^\s*'(.*)'\s*$/$1/;
    die "No variable name supplied with command line definition --> $_ <--\n" unless $var;
    die "No value supplied with definition of $var on command line --> $_ <--\n" unless $val;
    # Add variable defs found on the command line to the env hash
    # or assign these values to specific variables
    if ($var eq "cfsuser") {
      $cfsuser = $val;
    } elsif ($var eq "masterdir") {
      # Anchor the test so that only the exact values on/yes/y (any case,
      # surrounding blanks allowed) enable masterdir. The previous
      # unanchored match /(on|yes|y)/i treated any value merely containing
      # "y" or "on" (e.g. "none", "anything") as true.
      if ($val =~ /^\s*(on|yes|y)\s*$/i) {
        $masterdir = 1;
      } else {
        $masterdir = 0;
      }
    } else {
      # Otherwise push onto env
      # Multiple variable assignments for the same variable will be added to a
      # list of values for that variable.
      # Each individual assignment will be split on whitespace and added
      # as a separate entry in the list of values.
      push @{$env{$var}}, split(/\s+/,$val);

      # When runid is set by the user assume they want to see files in the official cfs dir
      if ($var eq "runid") {
        $masterdir = 1;
      }
    }
    next;
  }

  # Any remaining command line args should be patterns
  push @pattern, $_;
}

if ($verbose > 10) {
  foreach (sort keys %env) {
    print "$_ = ",join(",",@{$env{$_}}),"\n";
  }
  print "patterns: ",join(",",@pattern),"\n";
}

# A --quiet request overrides any --verbose flags
$verbose = -1 if $quiet;

# Read patterns from environment variables with names of the form file1, file2,...
if ($read_env) {
  my $found_file_def = 0;
  # Walk file1, file2, ... stopping at the first unset/empty value
  for (my $n = 1; ; $n++) {
    my $val = $ENV{"file$n"};
    last unless $val;
    # Convert these file names to lower case to work around a "feature" of the
    # CCCma archive filename database wherein all file names are lower case,
    # and anchor the pattern so it must match the entire file name
    push @pattern, '^' . lc($val) . '$';
    $found_file_def = 1;
  }
  unless ($found_file_def) {
    warn "Option --env was used but no file names are defined in the current environment.\n";
    die "At a minimum the variable file1 must be defined.\n"
  }
}

# Fall back to matching everything when no patterns were supplied
push @pattern,'*' unless scalar(@pattern);

if ($verbose > 10) {
  foreach my $p (@pattern) {print "pattern: $p\n"}
  print "cfsuser=$cfsuser\n";
  print "masterdir=$masterdir\n";
  foreach my $r (@{$env{runid}}) {print "runid: $r\n"}
}

# Preprocess each pattern to ensure it is a perl regex
foreach my $pat (@pattern) {
  if ($prex) {
    # The input pattern is assumed to be a perl regex
    push @allpat, $pat;
  } else {
    # The input pattern (possibly) contains shell wildcards
    # Replace the input pattern with a perl regex
    $ppat = $pat;
    # replace '[!' with '[^' (shell character class negation)
    $ppat =~ s/\Q[!/[^/g;
    # Escape literal dots BEFORE introducing any regex dots below.
    # In a shell glob '.' is a literal character; without this step a
    # pattern such as "sc_abc.dat" would also match "sc_abcXdat".
    $ppat =~ s/\./\\./g;
    # replace '*' with '.*'
    $ppat =~ s/\*/.*/g;
    # replace '?' with '.'
    $ppat =~ s/\?/./g;
    push @allpat, $ppat;
  }
}

# Process patterns ending with "_arc" separately.
# Such a pattern names an arcfile directly: locate the matching arcfile
# list(s) under $CCRNSRC/arc_dir/active and print their contents.
if (scalar(@allpat)) {
  my @new_allpat = ();
  $find_cwd = Cwd::cwd();
  foreach my $ppat (@allpat) {
    if ( $ppat =~ /_arc\s*$/i ) {
      # This is the name of an arcfile, list its contents
      undef $wanted;
      # File::Find invokes this with $_ set to the current entry's basename
      # and $find_name (alias of $File::Find::name) set to its full path.
      $wanted = sub {
          /^$ppat\z/s && do
          {
              chomp (my $arcfile = `basename $find_name`);
              # doexec is forward declared above and defined later in this
              # file; '{}' is presumably replaced by the current path, as
              # with find -exec -- TODO confirm against doexec's definition.
              # Setting prune stops File::Find from descending further.
              doexec(0, 'echo',"arcfile: $arcfile") &&
              doexec(0, 'cat','{}') &&
              ($File::Find::prune = 1)
          }
      };
      File::Find::find({wanted => \&$wanted}, "$ENV{CCRNSRC}/arc_dir/active");
    } else {
      # This is not the name of an arcfile
      push @new_allpat, $ppat;
    }
  }
  @allpat = @new_allpat;

  # At this point @allpat contains only the patterns
  # that are not of the form "*_arc", if any
  unless ( scalar(@allpat) ) {
    # All user supplied patterns contained "_arc" as the last 4 chars
    if ( $env{runid} ) {
      # The user has supplied at least 1 runid on the command line
      # Add ".*" to the pattern list so that it will be as if the user
      # invoked lsarc as e.g. "lsarc runid=xyz" implying the * wild card
      push @allpat, '.*';
    }
  }

  if ( $wanted and scalar(@allpat) ) {
    # There were "*_arc" patterns (therefore arcfiles were listed) and the user
    # has requested more possible output. Print a blank line for separation.
    print "\n";
  }
}

# Initialize the cfs data base in memory
die "Unable to initialize CFS database.\n" unless init_cfsdb;

# When dump_arcdat is true we go no farther
exit 0 if $dump_arcdat;

# Number of days since Jan 1 0:0:0 to the start of each month
# excluding leap years
my @SOM = ( 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334 );

$MON{M01} = "Jan";
$MON{M02} = "Feb";
$MON{M03} = "Mar";
$MON{M04} = "Apr";
$MON{M05} = "May";
$MON{M06} = "Jun";
$MON{M07} = "Jul";
$MON{M08} = "Aug";
$MON{M09} = "Sep";
$MON{M10} = "Oct";
$MON{M11} = "Nov";
$MON{M12} = "Dec";

# Main search: match the compiled patterns against the CFS database and
# report each hit. Reads %CFSDB/@indv_files (built by init_cfsdb) and the
# option globals; writes the hit report to stdout and, optionally, the
# total hit count to $count_file.
if (scalar(@allpat)) {
  $total_hits = 0;
  my @indv_hits;
  # RETRY is the re-entry point used when the database is reinitialized
  # below to search for individual files (--with_individual).
  RETRY: if ($match_all) {
    # Match all patterns supplied on the command line ("and" patterns)
    @hits = sort keys %CFSDB;
    @indv_hits = @indv_files;
    foreach my $ppat (@allpat) {
      # Grep through file names found on cfs
      if ($verbose > 0) {
        print "Searching for --> $ppat <--\n";
      }
      # Keep only the names that also match the current pattern
      @hits      = grep /$ppat/, @hits;
      @indv_hits = grep /$ppat/, @indv_hits;
    }
  } else {
    # Match any pattern found on the command line ("or" patterns)
    if (scalar(@allpat) > 1) {
      # Define a pattern that is the "or" of all input patterns
      $ppat = '(' . join('|',@allpat) . ')';
    } else {
      $ppat = $allpat[0];
    }
    # Grep through file names found on cfs
    if ($verbose > 0) {
      print "Searching for --> $ppat <--\n";
    }
    @hits = sort grep /$ppat/,keys %CFSDB;
    @indv_hits = grep /$ppat/,@indv_files;
  }

  if ($verbose > 0) {
    print "Found ",scalar(@hits)," matches\n";
  }
  $total_hits += scalar(@hits);
  foreach (@hits) {
    # Skip the dummy entry inserted by init_cfsdb
    next if /^NA$/;
    # The date field is a 13 character string laid out as YYYYDDDHHMMSS:
    # 4 digit year, 3 digit day of year, then hour, minute and second.
    my $year = substr($CFSDB{$_}{date},0,4);
    my $doy  = substr($CFSDB{$_}{date},4,3);
    my $hour = substr($CFSDB{$_}{date},7,2);
    # BUG FIX: the minute field starts at offset 9, not 8. Offset 8 is the
    # second digit of the hour, so minutes were previously reported as
    # (last digit of hour)(first digit of minute).
    my $min  = substr($CFSDB{$_}{date},9,2);
    my $sec  = substr($CFSDB{$_}{date},11,2);
    # Convert day of year to month/day (leap years are ignored, matching @SOM)
    my $m = 0;
    for ($m=1; $m<12; $m++) {
      my $s1 = 1*$SOM[$m-1];
      my $s2 = 1*$SOM[$m];
      last if 1*$doy > $s1 and $doy <= $s2;
    }
    # Defensive: when no month matched the loop leaves $m == 12 (December)
    $m = 12 if $m == 0;
    my $day = 1*$doy - 1*$SOM[$m-1];
    my $mm = sprintf("M%2.2d",$m);
    my $mon = $MON{$mm};
    my $date = sprintf("%3s %2d %4d %2.2d:%2.2d:%2.2d",$mon,$day,$year,$hour,$min,$sec);
    if ( $single_column_output ) {
      # Print only the file name, one file name per line
      printf "%s\n",$_;
    } else {
      printf " %10s %20s %15s  %-50s %s\n",
        $CFSDB{$_}{owner},$date,$CFSDB{$_}{size},$_,$CFSDB{$_}{arcfile};
    }
  }

  if (scalar(@indv_hits)) {
    # List any hits for individual files found on cfs via ssh
    $total_hits += scalar(@indv_hits);
    if ( $single_column_output ) {
      # Each element is an "ls -l" output line; the file name is the last field
      foreach (@indv_hits) {
        my @tmp_list = split " ";
        print pop(@tmp_list),"\n";
      }
    } else {
      print join("\n",@indv_hits),"\n";
    }
  }

  if ($with_individual and not $individual{check}) {
    # Reinitialize CFSDB with info about individual files and rerun the
    # same search once more (the flag flip prevents a third pass)
    $individual{check}=1;
    $individual{check_active}=1;
    $masterdir=0;
    $all_master_dir=0;
    $CFSDATA=0;
    $others_dir=0;
    $short_term_dir=0;
    $long_term_dir=0;

    print "Reinitializing data base to find individual files.\n";
    init_cfsdb;
    goto RETRY;
  }

  if ($count_file) {
    # Record the total number of hits for use by batch jobs
    open(my $cfh, '>', $count_file) or die "Unable to create $count_file\n";
    print {$cfh} $total_hits,"\n";
    close($cfh);
  }
}

exit 0;
########################################################
##################### End of main ######################
########################################################

sub lslR_cfs_dir {
  # Get a listing of all individual files in and below a user supplied cfs
  # directory. Individual files are files that do not belong to any "arc"
  # file. Each matching "ls -l" output line is appended to the global
  # @indv_files list. Returns 1 on success; dies on remote command failure.
  use strict;
  my ($cfsdir) = @_;

  die "Missing cfsdir. " unless $cfsdir;

  # Silently ignore missing directories
  return 1 unless -d $cfsdir;

  # Run a recursive long listing on the cfs2 host (possibly routed
  # through a head node via $ROUTE_SSH)
  my $shcmd = "ls -lR $cfsdir";
  chomp(my @listing = `$ROUTE_SSH ssh cfs2 \'$shcmd\' 2>&1`);
  if ( $? ) {
    # The remote listing failed; show its output and abort
    print "$_\n" for @listing;
    die "Problem executing $ROUTE_SSH ssh cfs2 $shcmd ";
  }
  # Keep only long-list lines for regular files, excluding "arc" files
  push @indv_files, grep { /^-/ and not /_arc\s*$/ } @listing;
  return 1;
}

sub init_cfsdb {
  # Initialize the cfsdb hash using info from ~acrnsrc/arc_dir/active/
  # (or via ssh to joule when the database is not on a local disk).
  #
  # Globals read:    $location, %individual, $CFSDATA, $masterdir, $cfsuser,
  #                  $others_dir, $all_master_dir, $short_term_dir,
  #                  $long_term_dir, %env, $verbose, $dump_arcdat
  # Globals written: %CFSDB and %DUPLICATE (reset here); @indv_files is
  #                  appended to indirectly via lslR_cfs_dir
  # Returns 1 when %CFSDB is non empty, undef otherwise.
  #
  # NOTE(review): read_arclist, process_arclist and process_individual
  # are defined elsewhere in this file.
  use strict;
  use Sys::Hostname;

  undef %CFSDB;
  undef %DUPLICATE;

  # Add a single dummy entry to the database to avoid the
  # 'Unable to initialize CFS database' error if no files are found
  # (the main report loop skips the "NA" key explicitly)
  $CFSDB{NA}{date}       = "0001001000000";
  $CFSDB{NA}{arcfile}    = "NA";
  $CFSDB{NA}{size}       = "NA";
  $CFSDB{NA}{owner}      = "NA";
  $CFSDB{NA}{short_term} = "NA";

  # ndir is used to keep track of the number of directories that are read
  my $ndir = 0;

  if ($location =~ /^cmc$/) {
    # When location is "cmc" assume that the CFS database is available
    # on a local disk

    if ($individual{check}) {
      # Get cfs info from the subdir ~acrnsrc/arc_dir/SUM_DIR about files
      # that are archived individually

      my $read_arc = $individual{read_arc};

      # Always look in the NEW subdir which will contain just the entries
      # that have been added on the current day
      my $SEARCHD = "$ENV{CCRNSRC}/arc_dir/SUM_DIR/NEW";
      process_individual("$SEARCHD",$read_arc);

      if ($individual{check_active}) {
        # Look in the active subdir which will contain all entries after
        # they have been processed by a cron job that runs once a day
        my $SEARCHD = "$ENV{CCRNSRC}/arc_dir/SUM_DIR/active";

        # Get a list of all subdirs in this directory with names like yNNNN
        # where NNNN are integers (a year)
        opendir(CURRDIR, "$SEARCHD") || die "Unable to open dir $SEARCHD.\n  Stopped";
          my @ylist = sort grep { !/^\./ and /^\s*y\d+/ and -d "$SEARCHD/$_"} readdir CURRDIR;
        closedir(CURRDIR);

        # Define a cutoff year and cutoff day which will be used to determine the oldest
        # directory to search. The default will be 10 days prior to the current date.

        # Determine the current year and day of year
        my $curr_year = `date "+%Y"`;
        chomp $curr_year;
        my $curr_day = `date "+%j"`;
        chomp $curr_day;

        # Set the default cutoff year and day of year
        # (10 days ago, wrapping into the previous year when necessary)
        my $cutoff_year = $curr_year;
        my $cutoff_day  = $curr_day - 10;
        while ($cutoff_day <= 0) {
          $cutoff_day = 365 + $cutoff_day;
          $cutoff_year--;
        }

        # Adjust the value of the cutoff year according to user input
        if ($individual{cutoff_year}) {
          # The user has supplied a cutoff year
          if ($individual{cutoff_year} =~ /^\s*curr\s*$/) {
            # Use the current year
            $cutoff_year = `date "+%Y"`;
            chomp $cutoff_year;
          } elsif ($individual{cutoff_year}<0) {
            # Determine the year relative to the current year
            $cutoff_year = $curr_year + $individual{cutoff_year};
            $cutoff_day = $curr_day;
          } else {
            # Force to integer. This will result in 0 if not int.
            $cutoff_year = 1*$individual{cutoff_year};
          }
        }

        # Adjust the value of the cutoff day according to user input
        if ($individual{cutoff_day}) {
          # The user has supplied a cutoff day
          if ($individual{cutoff_day} =~ /^\s*curr\s*$/) {
            # Use the current day of the year
            $cutoff_day = `date "+%j"`;
            chomp $cutoff_day;
          } elsif ($individual{cutoff_day}<0) {
            # Determine the day of year relative to the current day
            $cutoff_day = $curr_day + $individual{cutoff_day};
            $cutoff_year = $curr_year unless defined $individual{cutoff_year};
            while ($cutoff_day <= 0) {
              $cutoff_day = 365 + $cutoff_day;
              $cutoff_year--;
            }
          } else {
            # Force to integer. This will result in 0 if not int.
            # NOTE(review): this branch tests truth of cutoff_year while
            # the negative-day branch above tests "defined" -- a supplied
            # cutoff_year of 0 is treated differently; confirm intended.
            $cutoff_year = $curr_year unless $individual{cutoff_year};
            $cutoff_day = 1*$individual{cutoff_day};
          }
        }

        # Walk the year directories, skipping those older than the cutoff
        foreach my $ydir (@ylist) {
          next unless $ydir;
          my ($this_year) = $ydir =~ /(\d+)$/;
          next if ($this_year < $cutoff_year);
          # Determine the full pathname of the current year dir
          my $year_dir = "$SEARCHD/$ydir";
          # Get a list of all subdirs in this year directory with names like jdNNN
          # where NNN are integers (day of the year)
          opendir(YDIR, "$year_dir") || die "Unable to open dir $year_dir.\n  Stopped";
            my @dlist = sort grep { !/^\./ and /^\s*jd\d+/ and -d "$year_dir/$_"} readdir YDIR;
          closedir(YDIR);
          foreach my $ddir (@dlist) {
            next unless $ddir;
            my ($this_day) = $ddir =~ /(\d+)$/;
            # NOTE(review): "<=" also skips the cutoff day itself within
            # the cutoff year -- confirm the boundary is intended.
            next if ($this_year == $cutoff_year and $this_day <= $cutoff_day);
            process_individual("$year_dir/$ddir", $read_arc);
          }
        }

      }
    }

    if ($CFSDATA) {
      # Get cfs info from the file ~/info/CFSDATA, if it exists
      my $curr_owner = `whoami`;
      chomp $curr_owner;
      my $curr_arcdata = "$ENV{HOME}/info/CFSDATA";

      my  @h_arclist = ();
      if ( -s "$curr_arcdata" ) {
        # Read the arclist file if it is present and not empty
        my @xh_arclist = ();
        open (ARCFILE, "<$curr_arcdata") || return undef;
          push @xh_arclist, <ARCFILE>;
        close (ARCFILE);
        # Drop bookkeeping lines that start with ".cfs"
        @h_arclist = grep !/^\s*\.cfs/, @xh_arclist;
      }

      # Process the current arclist file adding entries to %CFSDB
      process_arclist($curr_owner, "0", @h_arclist);
    }

    unless ($masterdir or $CFSDATA) {
      # Copy cfs info from the users cfs directory in the cfs database

      # Create a 3 letter tag (the last 3 letters of the invoking users account name)
      # to be used to find the user specific arclist in ~acrnsrc/arc_dir/active/
      my $curr_owner = $cfsuser;
      (my $USR) = $curr_owner =~ /(...)$/;

      if ($long_term_dir) {
        # Read data from the users long term directory
        my $curr_dir = "$ENV{CCRNSRC}/arc_dir/active/ccrd_user_archive/$USR";

        # Append any individual files to @indv_files
        lslR_cfs_dir("/home/cfs_ccrd/ccrd_user_archive/$USR");

        # Read the arclist or *_arc files from the users long term dir
        # and return their contents as a list
        my  @ul_arclist = read_arclist("$curr_dir");

        # Process the current arclist file adding entries to %CFSDB
        process_arclist($curr_owner, "0", @ul_arclist);
      }

      if ($short_term_dir) {
        # Read data from the users short term directory
        my $curr_dir = "$ENV{CCRNSRC}/arc_dir/active/ccrd_short_term_archive/$USR";

        # Append any individual files to @indv_files
        lslR_cfs_dir("/home/cfs_ccrd/ccrd_short_term_archive/$USR");

        # Read the arclist or *_arc files from the users short term dir
        # and return their contents as a list
        my  @us_arclist = read_arclist("$curr_dir");

        # Process the current arclist file adding entries to %CFSDB
        # (second argument "1" marks these entries as short term)
        process_arclist($curr_owner, "1", @us_arclist);
      }

      my $cnet_dir = 1;
      if ($cnet_dir and $long_term_dir) {
        # Read data from the cnet (external) users long term directory
        my $curr_dir = "$ENV{CCRNSRC}/arc_dir/active/cnet/$USR";

        # Append any individual files to @indv_files
        lslR_cfs_dir("/home/cfs_ccrd/cnet/$USR");

        # Read the arclist or *_arc files from the users short term dir
        # and return their contents as a list
        my  @el_arclist = read_arclist("$curr_dir");

        # Process the current arclist file adding entries to %CFSDB
        process_arclist($curr_owner, "1", @el_arclist);
      }
    }

    if ($masterdir and not $CFSDATA) {
      # Copy cfs info from the official runs database
      # OFD is the root of the official data dir in the cfs database
      my $OFD = "$ENV{CCRNSRC}/arc_dir/active/ccrn/offcl_data";

      # The "others" dir is nonconforming so we process it separately
      my $read_others = 0;
      # read the others dir when no runids are supplied by the user
      $read_others = 1 unless $env{runid};
      # Override everything with the value of others_dir
      $read_others = $others_dir;
      if ($read_others) {
        # Get all individual files from the others dir
        # Append any individual files to @indv_files
        lslR_cfs_dir("/home/cfs_ccrd/ccrn/offcl_data/others");
      }
      if ( -d "$OFD/others" and $read_others) {
        # Read the arclist from the "others" dir
        my  @o_arclist = read_arclist("$OFD/others");

        # Process the current arclist file adding entries to %CFSDB
        process_arclist("acrnsrc", "0", @o_arclist);
        $ndir++;
      }

      # The official data tree is partitioned into single letter subdirs
      foreach my $dira (qw(a c d f m o r)) {
        next unless -d "$OFD/$dira";
        my @dlist = ();
        if ($env{runid} and not $all_master_dir) {
          # A list of runids was supplied on the command line
          # Subdirs are named for the first "_" separated field in runid
          @dlist = map {(split("_"))[0]} @{$env{runid}};
          # Remove any "-001" type suffix from the runid directory name
          # All runids of the form "xyz-001" are found in the "xyz" subdir
          foreach (@dlist) {s/-\d+$//};
        } else {
          # If $others_dir is true then the user has invoked --others but not
          # supplied any runids on the command line. In this case search *only*
          # the others dir
          next if ($others_dir and not $all_master_dir);

          # Get a list of all sub directories, whose names should be runids
          opendir(RUNDIR, "$OFD/$dira") || next;
            # Ignore all non directory entries
            @dlist = sort grep { !/^\./ and -d "$OFD/$dira/$_"} readdir RUNDIR;
          closedir(RUNDIR);
        }
        foreach my $runid_dir (@dlist) {
          next unless $runid_dir;
          next unless -d "$OFD/$dira/$runid_dir";

          # Append any individual files to @indv_files
          lslR_cfs_dir("/home/cfs_ccrd/ccrn/offcl_data/$dira/$runid_dir");

          # Each runid dir has (up to) d/m/p/s subdirs holding arclists
          foreach my $dirb (qw(d m p s)) {
            my $curr_dir = "$OFD/$dira/$runid_dir/$dirb";
            next unless -d "$curr_dir";

            # Read the arclist or *_arc files from the current dir
            # and return their contents as a list
            my  @o_arclist = read_arclist("$curr_dir");

            # Process the current arclist file adding entries to %CFSDB
            process_arclist("acrnsrc", "0", @o_arclist);
            $ndir++;
            if ($verbose > 0) {
              # Progress indicator: one dot per 100 directories
              print "." if $ndir % 100 == 0;
            }
	  }
	}
      }
      if ($verbose > 0) {
        print "\n" if $ndir > 100;
      }
    }

  } else {
    # The CFS database is not on a local disk

    # ssh to joule and dump the database using lsarc
    # (this script invoking itself remotely with --dump_arcdat)
    my $Opts = "";
    $Opts .= " masterdir=on" if $masterdir;
    $Opts .= " cfsuser=$cfsuser" unless $masterdir;
    $Opts .= " --individual" if $individual{check};
    unless ( $short_term_dir and $long_term_dir ) {
      # These should not be set if both short and long term dirs are to be searched
      # since doing so will actually set only set one or the other (ie the last one
      # that is defined on the lsarc command line).
      # Note that searching both short and long term is the default behaviour
      $Opts .= " --short" if $short_term_dir;
      $Opts .= " --long"  if $long_term_dir;
    }
    $Opts .= " --CFSDATA"  if $CFSDATA;
    if ($env{runid} and scalar(@{$env{runid}})) {
      $Opts .= ' runid="'.join(' ',@{$env{runid}}).'"'
    }
    my @cfsdb = `ssh joule \'lsarc --dump_arcdat --quiet $Opts\' 2>&1`;
    foreach (@cfsdb) {
      next unless $_;
      # Assign variables from the current line
      # (whitespace separated: name arcfile owner date size short_term)
      my ($fname, $arcname, $owner, $fdate, $fsize, $short_term) = split /\s+/;
      $CFSDB{$fname}{date}       = $fdate;
      $CFSDB{$fname}{size}       = $fsize;
      $CFSDB{$fname}{arcfile}    = $arcname;
      $CFSDB{$fname}{owner}      = $owner;
      $CFSDB{$fname}{short_term} = $short_term;
      if ($verbose > 10) {
        print "fname=$fname  arcname=$arcname  owner=$owner";
        print "  fdate=$fdate  fsize=$fsize  short_term=$short_term\n";
      }
    }
  }

  unless ($dump_arcdat) {
    # Report duplicate file names collected by is_duplicate (via
    # process_arclist) when verbose output was requested
    if ($verbose > 0 and $show_duplicates) {
      foreach (sort keys %DUPLICATE) {
        print "Duplicate entries found for $_ in arcfiles:\n";
        my $m = scalar(@{$DUPLICATE{$_}{arcfile}});
        my $n = 0;
        for ( $n = 0; $n < $m; $n++ ) {
          printf " %10s %15s %15s %s\n",
              $DUPLICATE{$_}{owner}   -> [$n],
              $DUPLICATE{$_}{date}    -> [$n],
              $DUPLICATE{$_}{size}    -> [$n],
              $DUPLICATE{$_}{arcfile} -> [$n];
        }
      }
      if ( scalar(keys %DUPLICATE) ) {
        print "Found ",scalar(keys %DUPLICATE)," duplicate entries in CFS database.\n";
      }
    }

    if ($verbose > 1) {
      print "Created ",scalar(keys %CFSDB)," entries in CFS database.\n";
    }
  }

  # Success means at least one entry exists (the NA dummy guarantees this
  # unless %CFSDB was never populated at all)
  if (%CFSDB) {
    return 1;
  } else {
    return undef;
  }
}

sub is_duplicate {
  # Detect and record duplicate file-name entries in the CFS database.
  #
  # Arguments (in order): file name, arcfile name, owner, date, size of
  # the entry currently being processed.
  #
  # Returns 1 when the file name is already present in %CFSDB (after
  # appending this entry's details to %DUPLICATE and flagging the %CFSDB
  # record), 0 otherwise.
  use strict;
  my ($file, $arcfile, $owner, $fdate, $fsize) = @_;

  # Not a duplicate unless this name has been seen before
  return 0 unless exists $CFSDB{$file};

  my %current = (
    arcfile => $arcfile,
    owner   => $owner,
    date    => $fdate,
    size    => $fsize,
  );

  # The first time a duplicate is seen for this name, seed each list
  # with the values previously stored in %CFSDB, then append the
  # current entry's values.  NOTE: the flag must be captured before the
  # first push, which autovivifies $DUPLICATE{$file}.
  my $first_dup = !exists $DUPLICATE{$file};
  for my $field (qw(arcfile owner date size)) {
    push @{$DUPLICATE{$file}{$field}}, $CFSDB{$file}{$field} if $first_dup;
    push @{$DUPLICATE{$file}{$field}}, $current{$field};
  }

  # Set a flag indicating that this file name occurs multiple times.
  # When this is true, the entry in %CFSDB is the last one found.
  $CFSDB{$file}{duplicate} = 1;
  return 1;
}

sub process_individual {
  # Update the CFSDB hash using a list of file names read from a subdirectory
  # of ~acrnsrc/arc_dir/SUM_DIR. Each file read will contain information on a
  # single file, which is either an arcfile saved on cfs or an individual file
  # that is not part of an arcfile but is saved on cfs.
  #
  # Arguments:
  #   $curr_dir - directory containing the *_sum summary files
  #   $read_arc - when true, also process files ending in "_arc_sum"
  #
  # Returns 1 on success, undef when $curr_dir is not a directory.
  use strict;
  my $curr_dir = shift;
  my $read_arc = shift;

  return undef unless -d "$curr_dir";

  if ($verbose > 10) {
    print "Processing directory $curr_dir\n";
  }

  # Get a list of all regular files in this directory that end with "_sum"
  opendir(my $dh, $curr_dir) || die "Unable to open dir $curr_dir.\n  Stopped";
    my @flist = sort grep { !/^\./ and /_sum\s*$/ and -f "$curr_dir/$_"} readdir $dh;
  closedir($dh);

  foreach (@flist) {

    # Ignore any file name that ends with "_arc_sum" unless
    # these files are requested via $read_arc
    next if (/_arc_sum\s*$/ and not $read_arc);

    # Each file should contain a single line containing the following
    # whitespace separated fields:  checksum  cfs_pathname  encoded_date  owner
    # Read the file directly rather than spawning "cat" in a subshell:
    # this avoids a fork per file and is immune to shell metacharacters
    # in the file name.  An unreadable file is skipped.
    open(my $fh, '<', "$curr_dir/$_") || next;
      my $file_contents = do { local $/; <$fh> };
    close($fh);
    my @line = split /\s+/, $file_contents;

    if ($verbose > 10) {
      print "$_   ",join(" ",@line),"\n";
    }

    # Use the last '/' separated field in the pathname as the file name
    my ($fname)    = (split /\//,$line[1])[-1];
    my $fdate      = $line[2];
    my $owner      = $line[3];
    my $arcname    = "NA";     # individual files have no arcfile
    my $fsize      = "NA";     # size is not recorded in the summary files
    my $short_term = 0;

    # If duplicate file names exist then update %DUPLICATE
    my $found_duplicate = is_duplicate($fname, $arcname, $owner, $fdate, $fsize);

    # Update %CFSDB (a duplicate overwrites the previous entry; the
    # older values are preserved in %DUPLICATE by is_duplicate above)
    $CFSDB{$fname}{date}       = $fdate;
    $CFSDB{$fname}{arcfile}    = $arcname;
    $CFSDB{$fname}{size}       = $fsize;
    $CFSDB{$fname}{owner}      = $owner;
    $CFSDB{$fname}{short_term} = $short_term;
    if ($dump_arcdat) {
      print "$fname $arcname $owner $fdate $fsize $short_term\n";
    }
  }
  return 1;
}

sub process_arclist {
  # Update the CFSDB hash using a list of lines read from an arclist file
  # or a number of *_arc files.
  #
  # Arguments:
  #   $owner      - owner recorded for every entry in this list
  #   $short_term - short-term flag recorded for every entry
  #   @_          - the arclist lines themselves
  #
  # Returns 1.
  use strict;
  my $owner = shift;
  my $short_term = shift;

  # When matching "any" pattern, the combined "or" pattern does not
  # depend on the line being processed, so build it once here instead
  # of once per line.
  my $or_pat;
  unless ($match_all) {
    $or_pat = $allpat[0];
    if (scalar(@allpat) > 1) {
      # Define a pattern that is the "or" of all input patterns
      $or_pat = '(' . join('|',@allpat) . ')';
    }
  }

  foreach (@_) {
    # The input list will contain lines from an arclist file in the format:
    # file_name  date  arcfile_name  file_number  version  size  version  checksum
    my @line = split /\s+/;

    # The file name is the first whitespace separated word on each line
    my $fname   = $line[0];
    my $fdate   = $line[1];
    my $arcname = $line[2];
    my $fsize   = $line[5];

    # If duplicate file names exist then update %DUPLICATE
    my $found_duplicate = is_duplicate($fname, $arcname, $owner, $fdate, $fsize);

    # Decide whether this file name matches the command line patterns
    my $match_found = 1;
    if ($match_all) {
      # Match all patterns supplied on the command line ("and" patterns)
      foreach my $ppat (@allpat) {
        $match_found = 0 unless $fname =~ /$ppat/;
      }
    } else {
      # Match any pattern found on the command line ("or" all patterns)
      $match_found = 0 unless $fname =~ /$or_pat/;
      if ($match_found and $verbose > 10) {
        print "file=$fname  ppat=$or_pat  match_found=$match_found\n";
      }
    }
    next unless $match_found;

    # Update %CFSDB (a duplicate overwrites the previous entry; the
    # older values are preserved in %DUPLICATE by is_duplicate above)
    $CFSDB{$fname}{date}       = $fdate;
    $CFSDB{$fname}{arcfile}    = $arcname;
    $CFSDB{$fname}{size}       = $fsize;
    $CFSDB{$fname}{owner}      = $owner;
    $CFSDB{$fname}{short_term} = $short_term;
    if ($dump_arcdat) {
      print "$fname $arcname $owner $fdate $fsize $short_term\n";
    }
  }
  return 1;
}

sub read_arclist {
  # Read a file named "arclist" in $curr_dir or, if arclist is not present
  # (or is empty), read every file named *_arc in that directory instead.
  #
  # Arguments:
  #   $curr_dir - directory to search
  #
  # Returns the list of lines read (newlines intact); returns the empty
  # list when the directory or files cannot be opened.
  use strict;
  my $curr_dir = shift;

  my @arclist = ();
  if ( -s "$curr_dir/arclist" ) {
    # Read the arclist file if it is present and not empty.
    # Three-arg open with a lexical handle avoids the 2-arg open
    # mode-injection pitfall with interpolated paths.
    open(my $fh, '<', "$curr_dir/arclist") || return ();
      push @arclist, <$fh>;
    close($fh);

  } else {
    # Read every *_arc file in this dir, in sorted order

    opendir(my $dh, $curr_dir) || return ();
      my @arcfiles = sort grep { !/^\./ and /_arc$/ } readdir $dh;
    closedir($dh);

    foreach my $arcfile (@arcfiles) {
      next unless $arcfile;
      # Skip (rather than abort on) any individual file that cannot be read
      open(my $fh, '<', "$curr_dir/$arcfile") || next;
        push @arclist, <$fh>;
      close($fh);
    }
  }

  return @arclist;
}

sub doexec ($@) {
    # Run @_ as an external command, substituting the current File::Find
    # path ($find_name) for every "{}" placeholder in each word, in the
    # style of find's -exec/-ok actions (this helper appears to be
    # find2perl-generated boilerplate -- see "sub doexec" in find2perl
    # output).
    #
    # Arguments:
    #   $ok      - when true, print the command on STDOUT as a prompt and
    #              require an answer starting with "y" on STDIN before
    #              running it (-ok behaviour); returns 0 if declined
    #   @command - command and arguments; "{}" is replaced in each word
    #
    # Returns true when the command exits with status 0, false otherwise.
    my $ok = shift;
    my @command = @_; # copy so we don't try to s/// aliases to constants
    for my $word (@command)
        { $word =~ s#{}#$find_name#g }
    if ($ok) {
        # Temporarily unbuffer STDOUT so the prompt is flushed before we
        # block reading the answer from STDIN
        my $old = select(STDOUT);
        $| = 1;
        print "@command";
        select($old);
        return 0 unless <STDIN> =~ /^y/;
    }
    # Run the command from the directory where the script was started
    # ($find_cwd), then return to the directory File::Find is currently
    # scanning so the traversal can continue
    chdir $find_cwd;
    system @command;
    chdir $File::Find::dir;
    return !$?;
}
