#!/usr/bin/python
"""
hpcwrap

A basic python wrapper for hpcarchive, allowing users to list and pull data on tape. Note that this script has not been written for
speed/optimization but for user ease and it makes quite a few assumptions about the archive structure. If speed, or general pulling
becomes a main goal, this should be altered.

Author: Clint Seinen

Date: 2018-01-24
"""
import argparse, os, json, re, sys
import string
import subprocess as sp
from datetime import datetime as dt

def query_hpcarch(runid, syear, eyear, ftypes, action, tar_dir, verbose, silent, astrc):
    """
    query_hpcarch(runid, syear, eyear, ftypes, action, tar_dir, verbose, silent, astrc)

    queries hpcarchive for the given runid/filetypes with model times falling between the desired range, and depending on the value of 'action' either creates/submits jobscripts that retrieves the data from tape or simply lists the files matching the query.

    Inputs:
    -------
        runid   : string
            - runid being considered ('*' globbing supported)
        syear   : int
            - beginning of desired record. If strings are given for syear and eyear, query_hpcarch will consider entire date range seen for the resulting files.
        eyear   : int
            - end of desired record
        ftypes  : list of strings
            - list of filetypes to look for ('*' globbing supported)
        action  : string
            - action to perform, currently supports "retrieve" or "list"
        tar_dir : string
            - if a non-empty string is given, places the files here, else it creates a directory in $RUNPATH with an 'tp_RUNID_' suffix and places the data there.
        verbose : boolean
            - if true, the function creates added verbosity in error messages
        silent  : boolean
            - if true, all extra output is suppressed
        astrc   : boolean
            - if true query_hpcarch lists/pulls all files, even duplicates in different archives. Additionally, when pulling data, this will pull the data into individual directories within the target directory for each archive, where the names are the actual archive names. If false, only the most recent files will be considered and no subdirectories will be created.
    """

    # enforce equal type for start and end years (both ints, or both the '' placeholder)
    if not type(syear) == type(eyear):
        print("variables eyear and syear must be the same type..aborting")
        sys.exit(3)

    if not silent: print("Checking tape (hpcarchive) for files associated with runids matching: " + runid)

    # change shell wild cards to equivalent regex construct
    runid = runid.replace('*', '.*')
    if ftypes: ftypes = [ft.replace('*', '.*') for ft in ftypes]

    # query the database
    #-- try/except implemented to handle errors from hpcarchive
    cmd = "hpcarchive -ljxU -p crd_cccma -c '^" + runid + "_'"
    try:
        hpc_op = sp.check_output(cmd, shell = True)
    except sp.CalledProcessError as e:
        if not silent: print("Sorry, unexpected error with hpcarchive...please try again")
        if verbose:
            print("Error output:\n================================\n")
            print(e.output)
        sys.exit(2)

    # parse raw json output
    try:
        data = json.loads(hpc_op)
    except ValueError as e:
        if not silent: print("Sorry, unexpected value error occurred while trying to parse the json data.")
        if not silent: print("The problem is likely associated with hpcarchive output....please run your command again.")
        if verbose:
            print("VERBOSE: raw hpcarchive output:\n==============================\n")
            print(hpc_op)
            print("\nVERBOSE: Error output:\n=======================================\n")
            # BUG FIX: ValueError has no 'output' attribute; printing e.output
            # raised an AttributeError and masked the real parse error.
            print(str(e))
        sys.exit(2)

    # check that any archives were found
    if not data['results']:
        if not silent: print("\nSorry, no archives found for " + runid)
        if not silent: print("Please review your query and try again")
        sys.exit(1)

    # build list of archives and filenames
    #-- Note: the runfiles_archtime is used to compare archive times, so we can
    #-- select the most recently archived file if duplicate filenames are found.
    runfiles            = []
    runfiles_arch       = []
    runfiles_archtime   = []
    for d in data['results']:
        filename = d['file']['filename']

        if filename in runfiles and not astrc:
            #-- multiple files with the same file name, check the archive time and replace with new file
            indx        = runfiles.index(filename)
            archtime    = dt.strptime(d['file']['archive_time'], '%Y-%m-%dT%H:%M:%S.%f')

            if archtime > runfiles_archtime[indx]:
                #---- this file was archived more recently, overwrite
                runfiles_arch[indx]     = d['archivename']
                runfiles_archtime[indx] = archtime
        else:
            #-- store information
            runfiles.append(filename)
            runfiles_arch.append(d['archivename'])
            runfiles_archtime.append(dt.strptime(d['file']['archive_time'], '%Y-%m-%dT%H:%M:%S.%f'))

    #-- get list of unique archives
    archives = list(set(runfiles_arch))

    # parse out start and end dates from archive names
    #-- NOTE(review): assumes archive names look like RUNID_YYYYMMDD_YYYYMMDD and
    #-- that RUNID itself contains no underscore — confirm against archive naming convention
    strt_dts = [dt.strptime(i.split('_')[1], '%Y%m%d') for i in archives]
    end_dts  = [dt.strptime(i.split('_')[2], '%Y%m%d') for i in archives]

    # sort dates and associated archives (kept paired by sorting the zipped tuples)
    strt_dts, archives = (list(tmp) for tmp in zip(*sorted(zip(strt_dts, archives))))
    end_dts.sort()

    # produce list of archives in the specified date range
    rng_archs       = []
    rng_archs_strt  = []
    rng_archs_end   = []
    if type(syear) == int:
        dt_startY   = dt(syear, 1, 1)
        dt_endY     = dt(eyear, 12, 31)
    else:
        # no dates given -> consider the full record found on tape
        dt_startY   = min(strt_dts)
        dt_endY     = max(end_dts)

    #-- check dates against desired date range
    for ind, (strt, end) in enumerate(zip(strt_dts, end_dts)):
        if (dt_startY <= strt <= dt_endY) or (dt_startY <= end <= dt_endY):
            # archive starts or ends inside the desired range
            rng_archs.append(archives[ind])
            rng_archs_strt.append(strt)
            rng_archs_end.append(end)
        elif (strt <= dt_startY) and (dt_endY <= end):
            # range is completely contained with-in one archive..add to list
            rng_archs.append(archives[ind])
            rng_archs_strt.append(strt)
            rng_archs_end.append(end)

    #-- check that any archives were ID'd
    if not rng_archs:
        min_strt = min(strt_dts)
        max_end  = max(end_dts)
        if not silent: print("No archives located for runid '" + runid + "' for the desired dates.. please review your query")
        if not silent: print("Archives available for simulation times from years " + str(min_strt.year) +
                                " to " + str(max_end.year))
        sys.exit(1)

    #-- if -ft flag given, check that the desired filetypes are present in each archive
    if ftypes:
        for arch in rng_archs:
            #---- get list of files in each the archive
            fls = [x for (x, y) in zip(runfiles, runfiles_arch) if y == arch]

            #---- check that filetypes were given
            #---- (inline flag moved to pattern start; mid-pattern '(?i)' is deprecated)
            for ft in ftypes:
                if not any(re.match(r'(?i).*' + ft + r'\..*', f) for f in fls):
                    if not silent: print("Sorry, filetype " + ft.replace('.*', '*') + " is not available..please review your query")
                    if verbose: print("VERBOSE: Flagged when looking in archive " + arch)
                    sys.exit(1)

    # check for multiple runids so they can be handled separately
    runids = [arch.split('_')[0] for arch in rng_archs]
    unique_runids = list(set(runids))

    # depending on the flags given, list or pull files off tape
    #============
    # LIST
    #============
    if action == 'list':

        for u_runid in unique_runids:
            if not silent: print("\nFiles for runid: " + u_runid)
            if not silent: print("Archive Date\t\tFilename")
            for arch in [a for a, r in zip(rng_archs, runids) if r == u_runid]:
                #-- list specific filetypes if -ft was used
                if ftypes:
                    for ft in ftypes:

                        #---- If silent is turned off, then print archive time as well as filename
                        if not silent:
                            print('\n'.join(z.strftime('%Y-%m-%d %H:%M:%S') + "\t" + os.path.basename(x)
                                for (x, y, z) in zip(runfiles, runfiles_arch, runfiles_archtime) if y == arch and re.match(r'(?i).*' + ft + r'\..*', x)))
                        #---- Else just print filename
                        else:
                            print('\n'.join(os.path.basename(x)
                                for (x, y) in zip(runfiles, runfiles_arch) if y == arch and re.match(r'(?i).*' + ft + r'\..*', x)))

                #-- Else, print all files associated with this archive
                else:
                    #---- print archive time and filename
                    if not silent:
                        print('\n'.join(z.strftime('%Y-%m-%d %H:%M:%S') + "\t" + os.path.basename(x)
                            for (x, y, z) in zip(runfiles, runfiles_arch, runfiles_archtime) if y == arch))
                    #---- print filename
                    else:
                        print('\n'.join(os.path.basename(x)
                            for (x, y) in zip(runfiles, runfiles_arch) if y == arch))

        if not silent: print("\n If you wish to retrieve these files please redo your command with '-r' instead of '-l'")
        sys.exit(0)

    #==========
    # RETRIEVE
    #==========
    if action == 'retrieve':
        if not silent: print("Files available, creating batch jobs to pull them off tape")

        # figure out what machine to submit to (hostname output carries a newline,
        # so substring/prefix tests are used rather than equality)
        mach = sp.check_output("hostname", shell = True)
        if "ppp1" in mach or mach.startswith("xc1"):
            mach = "ppp1"
        elif "ppp2" in mach or mach.startswith("xc2"):
            mach = "ppp2"
        elif "ppp3" in mach or mach.startswith("xc3"):
            mach = "ppp3"
        elif "ppp4" in mach or mach.startswith("xc4"):
            mach = "ppp4"
        else:
            mach = "ppp1" # defaults to ppp1

        # if filetypes have been given, create regex alternation for hpcarchive
        if ftypes:
            ft_reg = "|".join(ftypes)

        # loop through all runids found and create job script then submit
        for u_runid in unique_runids:

            #-- set resources for job script
            job_head =  ("#!/bin/bash\n"+
                        "#PBS -N hpcareq_"+u_runid+"\n"+
                        "#PBS -l walltime=1:00:00\n"+
                        "#PBS -o /dev/null\n"+
                        "#PBS -e /dev/null\n"+
                        "#PBS -q xxfer\n"+
                        "#PBS -l select=1:ncpus=1:mem=2G:res_image=ppp_eccc_all_default_ubuntu-14.04-amd64_latest\n"+
                        "#PBS -l place=free\n"+
                        "\n. /home/scrd101/generic/sc_cccma_setup_profile\n")

            #-- create space-separated list of archives to pull files from for this runid
            arch_list = " ".join(a for a, r in zip(rng_archs, runids) if r == u_runid)

            #-- set target directory name
            if not tar_dir:
                if type(syear) == int: # a date range was given, assign name to reflect it
                    start_str   = '%04d' % syear
                    end_str     = '%04d' % eyear
                else: # no dates were given, assign name by max/min dates for that runid
                    start_str   = '%04d' % min([s.year for s, r in zip(rng_archs_strt, runids) if r == u_runid])
                    end_str     = '%04d' % max([e.year for e, r in zip(rng_archs_end, runids) if r == u_runid])
                tp_dir = os.path.join("$RUNPATH", "tp_" + u_runid + "_" + start_str + "_" + end_str)
            else:
                tp_dir = tar_dir

            #-- building the hpc command according to the flags given
            if astrc:
                # place data in a separate dir for each archive
                # BUG FIX: there was no space after 'mkdir -p', producing
                # 'mkdir -p/path/...' in the generated job script
                hpc_cmd_p1 =   ("\tmkdir -p " + os.path.join(tp_dir, "$a") + "\n" +
                                "\thpcarchive -r " + os.path.join(tp_dir, "$a") + " -Ubx -p crd_cccma -c \"$a\"")
            else:
                # place files in flat directory
                hpc_cmd_p1 = "\thpcarchive -r " + tp_dir + " -Ubx -p crd_cccma -c \"$a\""

            #-- attach command for specific filetypes if needed
            if not ftypes:
                hpc_cmd = hpc_cmd_p1 + "\n"
            else:
                hpc_cmd = hpc_cmd_p1 + " -f \"" + u_runid + "_.*(?i)(" + ft_reg + ")\..*\"\n"

            #-- write commands to file
            job_filename = 'hpcareq_' + u_runid + '.pbs'
            job_file = open(job_filename, 'w')
            job_file.write(job_head)
            job_file.write("mkdir -p " + tp_dir + "\n")
            job_file.write("arch_list='" + arch_list + "'\n")
            job_file.write("for a in $arch_list; do\n")
            job_file.write(hpc_cmd)
            job_file.write("done\n")
            job_file.close()

            #-- SUBMIT (job script is removed only on success, so it can be inspected on failure)
            try:
                op = sp.check_output("jobsub -c " + mach + " " + job_filename, shell = True)
                if not silent: print("retrieval job for runid " + u_runid + " submitted. Data will be placed in " + tp_dir)
                if not silent: print("job id: " + op)
                os.remove(job_filename)
            except sp.CalledProcessError as e:
                print("Sorry, something went wrong in the job submission process")
                print("Please try again")
                sys.exit(4)

    sys.exit(0)

if __name__=="__main__":

    #==============================
    # Parse command line arguments 
    #==============================
    descript = '''
    List/pulls data on hpcarchive for the given runid and time range specified 
    by the user. 
    
    default behaviour:
        - list/retrieve the most up to date versions of files; this means that if there 
            are duplicate files in separate archives, only the most recent version will be 
            listed or pulled. See the '-astrc' flag if you wish to change this behaviour.
        - pulls data onto the user's $RUNPATH, for whichever machine the retrieval job was
            submitted to, creating a subdir titled tp_RUNID_SYEAR_EYEAR, where SYEAR
            AND EYEAR represent the simulation record considered. Use the '-t' flag to change this. 
                *Note* If no start and end year are given, hpcwrap will choose SYEAR and EYEAR
                    such that they are the min/max simulation years found.
                *Note* If hpcwrap is called from hall1, then ppp1 is used, while calling it 
                    from hall2 results in the use of ppp2. If called from the landing pad, ppp1 
                    is used. 
    '''

    usg_ex = '''usage example:

        hpcwrap -l abc
            lists all files and their respective archive times for run abc.

        hpcwrap -l abc -d 22
            lists all files and their respective archive times for run abc,
            for year 22.

        hpcwrap -ls abc -d 10,15        
            suppressing all output except the final list, list all files for run abc 
            for sim years 10-15.
        
        hpcwrap -l ab* -d 10,15 -ft grid_T
            lists grid_T files and their archive times for runids matching
            'ab*', for sim. years 10-15.

        hpcwrap -r abc -ft grid_t
            pull all grid_t files for run abc, placing them inside 
            $RUNPATH/tp_abc_maxs_mine/, where 'maxs' and 'mine' represent the earliest 
            and latest simulation years available, respectively.

        hpcwrap -r abc -d 10,15 -ft grid_t,restart
            pull grid_t and restart files for run abc and sim years 10-15,
            placing them inside $RUNPATH/tp_abc_10_15/.

        hpcwrap -r abc -d 10,15 -ft grid*,res*
            pull all filetypes matching 'grid*' and 'res*' for abc's sim years 10-15,
            placing them inside $RUNPATH/tp_abc_10_15/.

        hpcwrap -r ab* -d 10,15 -t /my/target/dir
            pull all files for runs matching 'ab*' for sim years 10-15, storing the
            files in /my/target/dir

        hpcwrap -rs abc -d 100,105 -astrc
            retrieve all files associated with run abc for sim. years 100-105, placing 
            them in $RUNPATH/tp_abc_100_105/, suppressing all output and creating separate 
            dirs for each archive accessed
    '''

    parser = argparse.ArgumentParser(prog = 'hpcwrap',
                                     description = descript,
                                     epilog = usg_ex,
                                     formatter_class = argparse.RawDescriptionHelpFormatter)

    # Required Vars
    parser.add_argument("RUNID",    action = "store",  help = "runid. Note: this argument supports '*' globbing.")

    # Options
    #-- Mutually exclusive options, one is required
    group = parser.add_mutually_exclusive_group(required = True)
    group.add_argument("-r",    action  = "store_true",     help = "Retrieve files, creating a RUNID_START_END/ directory in $RUNPATH.")
    group.add_argument("-l",    action  = "store_true",     help = "List files")

    #-- truly optional
    parser.add_argument("-ft",      metavar = "filetypes",      help = "Limit the search to the given filetypes in this COMMA DELIMITED LIST (do not insert any spaces, i.e grid_t,icemod,restart). Note that wildcard globbing is supported.")
    parser.add_argument("-d",       metavar = "dates",          help = "Limit the search for files in the date range specified in this comma delimited list (ie 5,10)")
    parser.add_argument("-t",       metavar = "target dir",     help = "Place retrieved files in this directory. Has no effect when listing files")
    parser.add_argument("-astrc",   action  = "store_true",     help = "Maintain archive structure. When retrieving, this will produce sub-directories for each archive under tp_RUNID_START_END/ and causes hpcwrap to list/pull files even if there are duplicates across different archives")
    parser.add_argument("-v",       action  = "store_true",     help = "Turn on verbosity")
    parser.add_argument("-s",       action  = "store_true",     help = "Turn off all extra output. This will suppress archive times.")

    # Parse the arguments
    args = parser.parse_args()

    # Set arguments/options
    runid   = args.RUNID
    ftypes  = args.ft
    dates   = args.d
    tar_dir = args.t
    rtrv    = args.r
    llst    = args.l
    astrc   = args.astrc
    verbose = args.v
    silent  = args.s

    # complain if the user activates both the silent and verbose switches
    if silent and verbose:
        print("Sorry, you have turned on verbosity and silence, please only use one of the -v or -s flags")
        sys.exit(3)

    # separate filetype list
    ftypes = ftypes.split(',') if ftypes else []

    # separate date list
    if dates:
        # ROBUSTNESS FIX: reject non-numeric years with a friendly message
        # instead of an unhandled ValueError traceback
        try:
            dates = [int(d) for d in dates.split(',')]
        except ValueError:
            print("Dates must be integer years given as a comma delimited list (ie 5,10). Review your query and try again.")
            sys.exit(3)

        # check how many dates were specified
        if len(dates) > 2:
            print("You can only specify a single date range...this requires only two numbers. Review your query and try again.")
            sys.exit(3)
        elif len(dates) == 1:
            if not silent: print("Only one date given..looking for data for that year")
            syear = dates[0]
            eyear = dates[0]
        else:
            syear = dates[0]
            eyear = dates[1]

            # enforce order of dates
            if syear > eyear:
                print("The specified start year must be earlier than the end year. Review your query and try again.")
                sys.exit(3)
    else:
        # empty strings signal "no range given" to query_hpcarch (it type-checks them)
        syear = ''
        eyear = ''

    # require an absolute target dir
    # BUG FIX: the old check only rejected paths starting with '.', letting
    # plain relative paths (e.g. 'mydir') through despite the requirement
    if tar_dir and not os.path.isabs(tar_dir):
        print("Please use absolute paths when specifying the target directory.")
        sys.exit(3)

    # determine action, and query the database
    if rtrv:
        query_hpcarch(runid, syear, eyear, ftypes, 'retrieve', tar_dir, verbose, silent, astrc)
    if llst:
        query_hpcarch(runid, syear, eyear, ftypes, 'list', tar_dir, verbose, silent, astrc)
