# Class definition:
#   RunJobArgo
#   Run class for Alpgen event generation payloads submitted to HPC resources
#   through the ARGO/Balsam service (job status is polled over RabbitMQ)
#   Instances are generated with RunJobFactory via pUtil::getRunJob()
#   Implemented as a singleton class
#   http://stackoverflow.com/questions/42558/python-and-the-singleton-pattern
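
# A minimal usage sketch (illustrative only; the factory call follows the
# pUtil::getRunJob() convention mentioned above, but the exact argument is an
# assumption):
#
#   from pUtil import getRunJob
#   runJobClass = getRunJob("Argo")     # hypothetical lookup key
#   runJob = runJobClass()              # __new__ always hands back the same instance
#   assert runJob is runJobClass()      # singleton behaviour
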
# Import relevant python/pilot modules
from RunJobHPC import RunJobHPC                     # Parent RunJobHPC class
import Site, pUtil, Job, Node, RunJobUtilities
from pUtil import tolog, isAnalysisJob, readpar, getExperiment
from FileStateClient import updateFileStates, dumpFileStates
from ErrorDiagnosis import ErrorDiagnosis           # import here to avoid issues seen at BU with missing module
from PilotErrors import PilotErrors
from datetime import datetime
from MessageInterface import MessageInterface
from ArgoJob import ArgoJob, ArgoJobStatus
from BalsamJob import BalsamJob
from SiteInformation import SiteInformation

# Standard python modules
import os, sys, commands, time, optparse, shlex, stat
import traceback
import atexit, signal

class RunJobArgo(RunJobHPC):

    # private data members
    __runjob = "RunJobArgo"                  # String defining the sub class
    __instance = None                        # Boolean used by subclasses to become a Singleton
    __error = PilotErrors()                  # PilotErrors object

    # public data members
    process = ""                             # zjet, wjet, wqq, wcjet, etc.
    base_filename = "alpout"                 # should be the same as in the input cards

    # controls for warmup
    warmup_phase0_number_events = None
    warmup_phase0_number_iterations = None
    warmup_phase1_number_events = None
    warmup_wall_minutes = None
    warmup_preprocess = 'alpgen_warmup_presubmit.sh'
    warmup_preprocess_args = None

    # controls for event generation (weighted gen + unweighting)
    evtgen_phase0_number_events = None
    evtgen_phase0_number_iterations = None
    evtgen_phase1_number_events = None
    evtgen_nodes = None
    evtgen_processes_per_node = None
    evtgen_wall_minutes = None
    evtgen_executable = 'alpgenCombo.sh'
    evtgen_scheduler_args = '--mode=script'
    evtgen_preprocess = 'alpgen_presubmit.sh'
    evtgen_postprocess = 'alpgen_postsubmit.sh'

    working_path = None
    input_url = None
    output_url = None
    pdf_filename = 'cteq6l1.tbl'
    username = None
    serial_site = 'argo_cluster'
    parallel_site = None
    group_identifier = None

    athena_input_card_executable = 'get_alpgen_input_card.py'
    athena_postprocess = 'alpgen_create_input_cards.py'
    athena_postprocess_log = 'alpgen_create_input_cards.log'
    ecm = None
    run_number = None
    job_config = None
    evgen_job_opts = None
    athena_input_card_name = 'input_card.mode_1.dat'  # card output by Generate_trf

    grid_ftp_server = 'atlasgridftp02.hep.anl.gov'
    grid_ftp_protocol = 'gsiftp://'

    job_working_path = '/grid/atlas/hpc/argo/jobs'

    argo_job = []

    # Required methods

    def __init__(self):
        """ Default initialization """
        pass

    def __new__(cls, *args, **kwargs):
        """ Override the __new__ method to make the class a singleton """

        if not cls.__instance:
            cls.__instance = super(RunJobHPC, cls).__new__(cls, *args, **kwargs)

        return cls.__instance

    def getRunJob(self):
        """ Return a string with the name of the RunJob sub class """

        return self.__runjob

    def getRunJobFileName(self):
        """ Return the filename of the module """

        return super(RunJobArgo, self).getRunJobFileName()

    # def argumentParser(self):  <-- see example in RunJob.py

    def allowLoopingJobKiller(self):
        """ Should the pilot search for looping jobs? """

        # The pilot has the ability to monitor the payload work directory. If there are no updated files within a certain
        # time limit, the pilot will consider the job as stuck (looping) and will kill it. The looping time limits are set
        # in environment.py (see e.g. loopingLimitDefaultProd)

        return False

    def get_argo_job(self, job):

        ##-----------------------
        # create argo job
        ##-----------------------

        argo_job = ArgoJob()
        argo_job.input_url = None  # self.GRID_FTP_PROTOCOL + self.GRID_FTP_SERVER + self.job_path
        if self.input_url is not None:
            argo_job.input_url = self.input_url
        argo_job.output_url = self.grid_ftp_protocol + self.grid_ftp_server + self.job_path
        if self.output_url is not None:
            argo_job.output_url = self.output_url
        argo_job.username = self.username
        argo_job.group_identifier = self.group_identifier

        ##-----------------------
        # create get alpgen input cards balsam job
        ##-----------------------

        input_file_imode0 = self.base_filename + '.input.0'
        input_file_imode1 = self.base_filename + '.input.1'
        input_file_imode2 = self.base_filename + '.input.2'

        input_cards_job = BalsamJob()
        input_cards_job.executable = self.athena_input_card_executable
        input_cards_job.executable_args = ('-e ' + self.ecm
                                           + ' -r ' + self.run_number
                                           + ' -o ' + self.job_config
                                           + ' -j ' + self.evgen_job_opts)
        input_cards_job.output_files = [input_file_imode0,
                                        input_file_imode1,
                                        input_file_imode2,
                                        self.athena_postprocess_log]
        input_cards_job.nodes = 1
        input_cards_job.processes_per_node = 1
        input_cards_job.wall_minutes = 0  # running on condor cluster so does not need time
        input_cards_job.username = self.username
        input_cards_job.target_site = self.serial_site
        input_cards_job.postprocess = self.athena_postprocess
        input_cards_job.postprocess_args = (' -i ' + self.athena_input_card_name + ' -p ' + self.process
                                            + ' -n ' + str(self.evtgen_phase1_number_events)
                                            + ' --log-filename=' + str(self.athena_postprocess_log))
        if self.warmup_phase0_number_events is not None:
            input_cards_job.postprocess_args += ' --wmp-evts-itr=' + str(self.warmup_phase0_number_events)
        if self.warmup_phase0_number_iterations is not None:
            input_cards_job.postprocess_args += ' --wmp-nitr=' + str(self.warmup_phase0_number_iterations)
        if self.warmup_phase1_number_events is not None:
            input_cards_job.postprocess_args += ' --wmp-evts=' + str(self.warmup_phase1_number_events)
        argo_job.add_job(input_cards_job)

        ##-----------------------
        # create warm-up job
        ##-----------------------

        # create grid filenames
        grid1 = self.base_filename + '.grid1'
        grid2 = self.base_filename + '.grid2'

        # create warmup balsam job
        warmup = BalsamJob()
        warmup.executable = self.process + 'gen90_mpi'
        warmup.executable_args = input_file_imode0
        warmup.input_files = [input_file_imode0]
        warmup.output_files = [grid1, grid2]
        warmup.nodes = 1
        warmup.processes_per_node = 1
        warmup.wall_minutes = 0  # running on condor cluster so does not need time
        warmup.username = self.username
        warmup.target_site = self.serial_site
        warmup.preprocess = self.warmup_preprocess
        argo_job.add_job(warmup)

        ##-----------------------
        # create event generation job
        ##-----------------------

        # create executable
        alpgen_exe = self.process + 'gen90_mpi_ramdisk_nomrstpdfs'
        if 'argo_cluster' in self.parallel_site:  # no ramdisk needed on argo_cluster
            alpgen_exe = self.process + 'gen90_mpi'

        # create filenames
        unw = self.base_filename + '.unw.gz'
        unw_par = self.base_filename + '_unw.par'
        wgt = self.base_filename + '.wgt'
        wgt_par = self.base_filename + '.par'
        directoryList_before = 'directoryList_before.txt'
        directoryList_after = 'directoryList_after.txt'

        # create event gen balsam job
        evtgen = BalsamJob()
        evtgen.executable = self.evtgen_executable
        evtgen.executable_args = (alpgen_exe + ' ' + input_file_imode1 + ' '
                                  + input_file_imode2 + ' ' + str(self.evtgen_processes_per_node))
        evtgen.input_files = [grid1,
                              grid2,
                              input_file_imode1,
                              input_file_imode2]
        evtgen.output_files = [unw,
                               unw_par,
                               directoryList_before,
                               directoryList_after,
                               self.evtgen_postprocess + '.out',
                               self.evtgen_postprocess + '.err',
                               ]
        evtgen.preprocess = self.evtgen_preprocess
        evtgen.postprocess = self.evtgen_postprocess
        evtgen.postprocess_args = self.base_filename
        evtgen.nodes = self.evtgen_nodes
        evtgen.processes_per_node = self.evtgen_processes_per_node
        evtgen.wall_minutes = self.evtgen_wall_minutes
        evtgen.username = self.username
        evtgen.scheduler_args = self.evtgen_scheduler_args
        evtgen.target_site = self.parallel_site
        argo_job.add_job(evtgen)

        return argo_job
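
    # Illustrative summary (comments only, not executed): the ArgoJob assembled
    # above chains three Balsam jobs: the input-card job and the warm-up job run
    # on the serial site, and the MPI event-generation job runs on the parallel
    # site. The result is serialized and sent to ARGO by executePayload() below,
    # roughly as:
    #
    #   self.argo_job = self.get_argo_job(job)
    #   mi.send_msg(self.argo_job.serialize(), 'argo_job')
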

    def setup(self, job, jobSite, thisExperiment):
        """ prepare the setup and get the run command list """

        # start setup time counter
        t0 = time.time()
        ec = 0

        # split up the job parameters to be able to loop over the tasks
        jobParameters = job.jobPars.split("\n")[0]
        jobTrf = job.trf.split("\n")[0]

        parser = optparse.OptionParser(description='program to submit alpgen jobs like a pilot')
        parser.add_option('-p', '--process', dest='process', help='Alpgen Process, i.e. zjet, wjet, wqq, etc.')
        parser.add_option('-n', '--nevts', dest='nevts', help='Number of weighted events requested in input file for weighted event generation', type='int')
        parser.add_option('-g', '--group-id', dest='group_identifier', help='User specified string that helps the user group jobs together.')
        parser.add_option('-e', '--ecm', dest='ecm', help='Center of Mass Energy.')
        parser.add_option('-r', '--run-number', dest='run_number', help='Run Number')
        parser.add_option('-c', '--jobConfig', dest='jobConfig', help='Job Options that will be used from the Job Config tarball, i.e. MC12JobOptions/MC12.<Run Number>.<description>.py')
        parser.add_option('-j', '--evgenJobOpts', dest='evgenJobOpts', help='Job Config tarball, i.e. MC12JobOpts-XX-YY-ZZ.tar.gz')
        parser.add_option('', '--dev', dest='dev', help='For development only.', action='store_true', default=False)
        parser.add_option('-q', '--status-queue', dest='enable_status_queue', help='Enable the setting of the message queue parameter in the ArgoJob, which means ARGO will not send message updates for this job to the queue with its job ID.', action='store_true', default=False)
        #parser.add_option('-a', '--warmup-evts', dest='warmup_evts', help='For Warmup Step: Three numbers separated by commas giving the number of events per iteration, number of iterations, and final number of events to generate. Example: "10000,10,1000000"')
        parser.add_option('-b', '--evtgen-evts', dest='evtgen_evts', help='For Event Generation Step: The number of events to generate in the event generation step. The output of unweighted events tends to be lower, so request more than you want. For example, W+0jets gives you 70%, W+1jet gives you 16%, W+2jet gives you 5%, W+3jet gives you 1%, and so on.', type='int')
        parser.add_option('-o', '--num-nodes', dest='numnodes', help='number of nodes to use on destination machine', type='int')
        parser.add_option('-u', '--ranks-per-node', dest='ranks_per_node', help='number of MPI ranks per node to use on destination machine', type='int')
        parser.add_option('-t', '--wall-time', dest='walltime', help='The wall time to submit to the queue in minutes.', type='int')
        parser.add_option('-s', '--site', dest='site', help='Balsam site name on which to run the event generation')
        parser.add_option('-x', '--no-submit', dest='submit', help='do not submit the message to ARGO. For testing purposes.', action='store_false', default=True)
        parser.add_option('', '--wmp-evts-itr', dest='wm_evts_per_itr', help='Warmup: Number of weighted events per iteration.')
        parser.add_option('', '--wmp-nitr', dest='wm_nitr', help='Warmup: Number of iterations')
        parser.add_option('', '--wmp-evts', dest='wm_evts', help='Warmup: Number of final events to produce.')

        try:
            options, args = parser.parse_args(shlex.split(jobParameters))
        except:
            ec = self.__error.ERR_SETUPFAILURE
            job.pilotErrorDiag = "Failure to parse job arguments"
            tolog("Failure to parse job arguments for ARGO job")
            return ec, job

        tolog("ARGO job will be launched with the following parameters: %s" % jobParameters)

        self.process = options.process
        self.username = 'pilot, %s' % job.prodUserID[:120]  # os.environ['USER']
        self.group_identifier = options.group_identifier
        self.ecm = options.ecm
        self.run_number = options.run_number
        self.job_config = options.jobConfig
        self.evgen_job_opts = options.evgenJobOpts
        self.warmup_phase0_number_events = options.wm_evts_per_itr
        self.warmup_phase0_number_iterations = options.wm_nitr
        self.warmup_phase1_number_events = options.wm_evts
        self.evtgen_phase1_number_events = options.evtgen_evts
        self.evtgen_nodes = options.numnodes
        self.evtgen_processes_per_node = options.ranks_per_node
        self.evtgen_wall_minutes = options.walltime
        self.parallel_site = options.site
        self.dev = options.dev

        self.job_path = os.path.join(self.job_working_path, job.jobId)
        tolog("ARGO job path: %s" % self.job_path)
        self.argo_job = self.get_argo_job(job)

        if options.dev:
            job.serial_site = 'argo_cluster_dev'

        # verify that the multi-trf job is setup properly
        os.chdir(jobSite.workdir)
        tolog("Current job workdir is %s" % os.getcwd())

        job.timeSetup = int(time.time() - t0)
        tolog("Total setup time: %d s" % (job.timeSetup))

        return ec, job
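
    # Example of a jobPars string that the parser in setup() accepts (all values
    # below are hypothetical and only meant to show the expected shape):
    #
    #   -p wjet -g mc12_wjets -e 8000 -r 147001 \
    #       -c MC12JobOptions/MC12.147001.description.py \
    #       -j MC12JobOpts-00-01-02.tar.gz -b 100000 -o 2 -u 32 -t 60 -s mira
    #
    # i.e. a W+jets Alpgen request for 100000 events in the event-generation step
    # on 2 nodes with 32 ranks per node, a 60 minute wall time, targeting the
    # Balsam site 'mira'.
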

    def executePayload(self, thisExperiment, job):

        t0 = os.times()
        res_tuple = None

        # loop over all run commands (only >1 for multi-trfs)
        getstatusoutput_was_interrupted = False
        job_status = None
        tolog("About to launch ARGO job")

        # Poll MQ for Job Status
        try:
            # Initiate MQ interface and send job
            self.argo_job.job_status_routing_key = '%s_job_status' % job.jobId  # 'status_' + jobID

            si = SiteInformation()
            mi = MessageInterface()
            mi.host = 'atlasgridftp02.hep.anl.gov'
            mi.port = 5671

            mi.ssl_cert = si.getSSLCertificate()  # '/grid/atlas/hpc/pilot_certs/xrootdsrv-cert.pem'
            proxy_cert_path = si.getSSLCertificate()
            mi.ssl_cert = os.path.dirname(proxy_cert_path) + "/rabbitmq-cert.pem"
            if 'X509_USER_CERT' in os.environ.keys():
                mi.ssl_cert = os.environ['X509_USER_CERT']  # '/users/hpcusers/balsam_dev/gridsecurity/jchilders/xrootdsrv-cert.pem'

            mi.ssl_key = mi.ssl_cert  # '/grid/atlas/hpc/pilot_certs/xrootdsrv-key.pem'
            mi.ssl_key = os.path.dirname(proxy_cert_path) + "/rabbitmq-key.pem"
            if 'X509_USER_KEY' in os.environ.keys():
                mi.ssl_key = os.environ['X509_USER_KEY']  # '/users/hpcusers/balsam_dev/gridsecurity/jchilders/xrootdsrv-key.pem'

            #mi.ssl_ca_certs = os.path.dirname(proxy_cert_path) + "/rabbitmq-cacerts.pem"
            mi.ssl_ca_certs = '/grid/atlas/hpc/pilot_certs/cacerts.pem'
            #if 'X509_CA_CERTS' in os.environ.keys():
            #    mi.ssl_ca_certs = os.environ['X509_CA_CERTS']  # '/users/hpcusers/balsam_dev/gridsecurity/jchilders/cacerts.pem'
            #tolog("CA certs: %s" % (mi.ssl_ca_certs))
            ca_certs = os.path.dirname(proxy_cert_path) + "/rabbitmq-cacerts.pem"
            if os.path.isfile(ca_certs):
                mi.ssl_ca_certs = ca_certs

            mi.exchange_name = 'argo_users'

            # Create queue to get messages about ARGO Job status from MQ
            tolog('Opening connection with MQ')
            mi.open_blocking_connection()
            tolog('Create queue [%s] to retrieve messages with job status' % self.argo_job.job_status_routing_key)
            mi.create_queue(self.argo_job.job_status_routing_key, self.argo_job.job_status_routing_key)

            # submit ARGO job to MQ
            #tolog('Opening connection with MQ')
            #mi.open_blocking_connection()
            routing_key = 'argo_job'
            if self.dev:
                routing_key = 'argo_job_dev'
            tolog('Sending msg with job to ARGO')
            mi.send_msg(self.argo_job.serialize(), routing_key)
            tolog(' done sending ')

            # Wait until the job is done or has failed
            ARGO_err_msg = ''
            while True:
                time.sleep(5)
                message = mi.receive_msg(self.argo_job.job_status_routing_key, True)
                if message[2]:
                    tolog("Got message from queue [%s]: method [%s], properties [%s], body [ %s ]" % (self.argo_job.job_status_routing_key, message[0], message[1], message[2]))
                    job_status = ArgoJobStatus.get_from_message(message[2])
                    job.hpcStatus = job_status.state
                    rt = RunJobUtilities.updatePilotServer(job, self.getPilotServer(), self.getPilotPort())
                    tolog("Extracted state: %s" % job_status.state)
                    if job_status.state == job_status.HISTORY:
                        res_tuple = (0, "Done")
                        break
                    elif job_status.is_failed():
                        res_tuple = (1, "Failed")
                        ARGO_err_msg = ARGO_err_msg + ' ' + job_status.message
                    elif job_status.state == job_status.FAILED:
                        res_tuple = (1, "Failed")
                        ARGO_err_msg = ARGO_err_msg + ' ' + job_status.message
                        self.failJob(1, 0, job, ins=job.inFiles, pilotErrorDiag=ARGO_err_msg)
                        break
                time.sleep(5)
            mi.close()
            tolog(' closing connection to MQ')

            tolog("Job State: %s" % (job_status.state))
            #job.timeExe = int(fork_job.finished - fork_job.started)

            ####################################################

        except Exception, e:
            tolog("!!FAILED!!3000!! Failed to run command %s" % str(e))
            getstatusoutput_was_interrupted = True
            res_tuple = (1, "Failed")
            self.failJob(0, self.__error.ERR_GENERALERROR, job, pilotErrorDiag=str(e))
        else:
            if res_tuple[0] == 0:
                tolog("ARGO Job finished")
            else:
                tolog("ARGO Job failed: res = %s" % (str(res_tuple)))

        t1 = os.times()
        job.timeExe = int(round(t1[4] - t0[4]))

        tolog("Original exit code: %s" % (res_tuple[0]))
        if res_tuple[0] != None:
            tolog("Exit code: %s (returned from OS)" % (res_tuple[0] % 255))
            res0 = res_tuple[0] % 255
            if job_status:
                exitMsg = job_status.message
            else:
                exitMsg = res_tuple[1]
        else:
            tolog("Exit code: None (returned from OS, Job was canceled or interrupted)")
            res0 = None
            exitMsg = "Job was canceled by internal call"

        # check the job report for any exit code that should replace the res_tuple[0]
        res = (res0, res_tuple[1], exitMsg)

        # dump an extract of the payload output
        tolog("NOTE: For %s output, see files %s, %s" % (job.payload, job.stdout, job.stderr))

        # JEM job-end callback
        try:
            from JEMstub import notifyJobEnd2JEM
            notifyJobEnd2JEM(job, tolog)
        except:
            pass  # don't care (fire and forget)

        return res, job, getstatusoutput_was_interrupted
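
# The exchange with ARGO above follows a simple submit-then-poll pattern: the
# serialized ArgoJob is published to the 'argo_job' (or 'argo_job_dev') routing
# key, and a per-job queue named '<jobId>_job_status' is polled until a terminal
# state arrives. A stripped-down sketch of that loop, assuming the same
# MessageInterface/ArgoJobStatus API used in executePayload():
#
#   mi.open_blocking_connection()
#   mi.create_queue(key, key)                       # key = '%s_job_status' % job.jobId
#   mi.send_msg(argo_job.serialize(), 'argo_job')
#   while True:
#       method, properties, body = mi.receive_msg(key, True)
#       if body:
#           status = ArgoJobStatus.get_from_message(body)
#           if status.state == status.HISTORY or status.is_failed():
#               break
#       time.sleep(5)
#   mi.close()
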

if __name__ == "__main__":

    tolog("Starting RunJobArgo")

    # Get error handler
    error = PilotErrors()

    # Get runJob object
    runJob = RunJobArgo()

    # Define a new parent group
    os.setpgrp()

    # Protect the runJob code with exception handling
    hP_ret = False
    try:
        # always use this filename as the new jobDef module name
        import newJobDef

        jobSite = Site.Site()

        return_tuple = runJob.argumentParser()
        tolog("argumentParser returned: %s" % str(return_tuple))
        jobSite.setSiteInfo(return_tuple)

        # jobSite.setSiteInfo(argParser(sys.argv[1:]))

        # reassign workdir for this job
        jobSite.workdir = jobSite.wntmpdir

        if runJob.getPilotLogFilename() != "":
            pUtil.setPilotlogFilename(runJob.getPilotLogFilename())

        # set node info
        node = Node.Node()
        node.setNodeName(os.uname()[1])
        node.collectWNInfo(jobSite.workdir)

        # redirect stderr
        sys.stderr = open("%s/runjob.stderr" % (jobSite.workdir), "w")

        tolog("Current job workdir is: %s" % os.getcwd())
        tolog("Site workdir is: %s" % jobSite.workdir)

        # get the experiment object
        thisExperiment = getExperiment(runJob.getExperiment())
        tolog("RunJob will serve experiment: %s" % (thisExperiment.getExperiment()))

        # set the cache (used e.g. by LSST)
        #if runJob.getCache():
        #    thisExperiment.setCache(runJob.getCache())

        #JR = JobRecovery()

        try:
            job = Job.Job()
            job.setJobDef(newJobDef.job)
            job.workdir = jobSite.workdir
            job.experiment = runJob.getExperiment()
            # figure out and set payload file names
            job.setPayloadName(thisExperiment.getPayloadName(job))
        except Exception, e:
            pilotErrorDiag = "Failed to process job info: %s" % str(e)
            tolog("!!WARNING!!3000!! %s" % (pilotErrorDiag))
            runJob.failJob(0, error.ERR_UNKNOWN, job, pilotErrorDiag=pilotErrorDiag)

        # prepare for the output file data directory
        # (will only be created for jobs that end up in a 'holding' state)
        job.datadir = runJob.getParentWorkDir() + "/PandaJob_%s_data" % (job.jobId)

        # register cleanup function
        atexit.register(runJob.cleanup, job)

        # raise an exception on SIGTERM so that the cleanup function gets a chance to run,
        # since by default a signal terminates the process without cleanup
        def sig2exc(sig, frm):
            """ signal handler """

            error = PilotErrors()
            runJob.setGlobalPilotErrorDiag("!!FAILED!!3000!! SIGTERM Signal %s is caught in child pid=%d!\n" % (sig, os.getpid()))
            tolog(runJob.getGlobalPilotErrorDiag())
            if sig == signal.SIGTERM:
                runJob.setGlobalErrorCode(error.ERR_SIGTERM)
            elif sig == signal.SIGQUIT:
                runJob.setGlobalErrorCode(error.ERR_SIGQUIT)
            elif sig == signal.SIGSEGV:
                runJob.setGlobalErrorCode(error.ERR_SIGSEGV)
            elif sig == signal.SIGXCPU:
                runJob.setGlobalErrorCode(error.ERR_SIGXCPU)
            elif sig == signal.SIGBUS:
                runJob.setGlobalErrorCode(error.ERR_SIGBUS)
            elif sig == signal.SIGUSR1:
                runJob.setGlobalErrorCode(error.ERR_SIGUSR1)
            else:
                runJob.setGlobalErrorCode(error.ERR_KILLSIGNAL)
            runJob.setFailureCode(runJob.getGlobalErrorCode())

            # print to stderr
            print >> sys.stderr, runJob.getGlobalPilotErrorDiag()
            raise SystemError(sig)

        signal.signal(signal.SIGTERM, sig2exc)
        signal.signal(signal.SIGQUIT, sig2exc)
        signal.signal(signal.SIGSEGV, sig2exc)
        signal.signal(signal.SIGXCPU, sig2exc)
        signal.signal(signal.SIGBUS, sig2exc)

        # see if it's an analysis job or not
        analysisJob = isAnalysisJob(job.trf.split(",")[0])
        if analysisJob:
            tolog("User analysis job")
        else:
            tolog("Production job")
        tolog("runJobArgo received a job with prodSourceLabel=%s" % (job.prodSourceLabel))

        # setup starts here ................................................................................

        # update the job state file
        job.jobState = "setup"
        #_retjs = JR.updateJobStateTest(job, jobSite, node, mode="test")

        # send [especially] the process group back to the pilot
        job.setState([job.jobState, 0, 0])
        rt = RunJobUtilities.updatePilotServer(job, runJob.getPilotServer(), runJob.getPilotPort())

        # prepare the setup and get the run command list
        ec, job = runJob.setup(job, jobSite, thisExperiment)
        if ec != 0:
            tolog("!!WARNING!!2999!! runJob setup failed: %s" % (job.pilotErrorDiag))
            runJob.failJob(0, ec, job, pilotErrorDiag=job.pilotErrorDiag)
        tolog("Setup has finished successfully")

        # job has been updated, display it again
        job.displayJob()

        # (setup ends here) ................................................................................

        tolog("Setting stage-in state until all input files have been copied")
        job.setState(["stagein", 0, 0])
        # send the special setup string back to the pilot (needed for the log transfer on xrdcp systems)
        rt = RunJobUtilities.updatePilotServer(job, runJob.getPilotServer(), runJob.getPilotPort())

        # stage-in .........................................................................................

        # update the job state file
        job.jobState = "stagein"
        #_retjs = JR.updateJobStateTest(job, jobSite, node, mode="test")

        # update copysetup[in] for production jobs if brokerage has decided that remote I/O should be used
        if job.transferType == 'direct':
            tolog('Brokerage has set transfer type to \"%s\" (remote I/O will be attempted for input files, any special access mode will be ignored)' %\
                  (job.transferType))
            RunJobUtilities.updateCopysetups('', transferType=job.transferType)

        # stage-in all input files (if necessary)
        job, ins, statusPFCTurl, usedFAXandDirectIO = runJob.stageIn(job, jobSite, analysisJob)
        if job.result[2] != 0:
            tolog("Failing job with ec: %d" % (ec))
            runJob.failJob(0, job.result[2], job, ins=ins, pilotErrorDiag=job.pilotErrorDiag)

        # after stageIn, all file transfer modes are known (copy_to_scratch, file_stager, remote_io)
        # consult the FileState file dictionary if cmd3 should be updated (--directIn should not be set if all
        # remote_io modes have been changed to copy_to_scratch as can happen with ByteStream files)
        # and update the run command list if necessary.
        # in addition to the above, if FAX is used as a primary site mover and direct access is enabled, then
        # the run command should not contain the --oldPrefix, --newPrefix options but use --usePFCTurl
        #if job.inFiles != ['']:
        #    runCommandList = RunJobUtilities.updateRunCommandList(runCommandList, runJob.getParentWorkDir(), job.jobId, statusPFCTurl, analysisJob, usedFAXandDirectIO)

        # (stage-in ends here) .............................................................................

        # change to running state since all input files have been staged
        tolog("Changing to running state since all input files have been staged")
        job.setState(["running", 0, 0])
        rt = RunJobUtilities.updatePilotServer(job, runJob.getPilotServer(), runJob.getPilotPort())

        # update the job state file
        job.jobState = "running"
        #_retjs = JR.updateJobStateTest(job, jobSite, node, mode="test")

        # run the job(s) ...................................................................................

        # Set ATLAS_CONDDB if necessary, and other env vars
        RunJobUtilities.setEnvVars(jobSite.sitename)

        # execute the payload
        res, job, getstatusoutput_was_interrupted = runJob.executePayload(thisExperiment, job)

        tolog("Check ARGO output: %s" % runJob.job_path)

        # if payload leaves the input files, delete them explicitly
        if ins:
            ec = pUtil.removeFiles(job.workdir, ins)

        # payload error handling
        ed = ErrorDiagnosis()
        if res[0] == None:
            job.jobState = "cancelled"
            job.setState(["cancelled", 0, 0])
            rt = RunJobUtilities.updatePilotServer(job, runJob.getPilotServer(), runJob.getPilotPort())
        #else:
        #    job = ed.interpretPayload(job, res, getstatusoutput_was_interrupted, current_job_number, runCommandList, runJob.getFailureCode())
        if job.result[1] != 0 or job.result[2] != 0:
            runJob.failJob(job.result[1], job.result[2], job, pilotErrorDiag=job.pilotErrorDiag)

        # stage-out ........................................................................................

        # update the job state file
        tolog(runJob.getOutputDir())

        job.jobState = "stageout"
        #_retjs = JR.updateJobStateTest(job, jobSite, node, mode="test")

        # verify and prepare the output files for transfer
        ec, pilotErrorDiag, outs, outsDict = RunJobUtilities.prepareOutFiles(job.outFiles, job.logFile, runJob.job_path)
        if ec:
            # missing output file (only error code from prepareOutFiles)
            runJob.failJob(job.result[1], ec, job, pilotErrorDiag=pilotErrorDiag)
        tolog("outsDict: %s" % str(outsDict))

        # update the current file states
        updateFileStates(outs, runJob.getParentWorkDir(), job.jobId, mode="file_state", state="created")
        dumpFileStates(runJob.getParentWorkDir(), job.jobId)

        # create xml string to pass to dispatcher for atlas jobs
        outputFileInfo = {}

        if outs or (job.logFile and job.logFile != ''):
            # get the datasets for the output files
            dsname, datasetDict = runJob.getDatasets(job)

            # re-create the metadata.xml file, putting guids of ALL output files into it.
            # output files that miss guids from the job itself will get guids in PFCxml function

            # first rename and copy the trf metadata file for non-build jobs
            if not pUtil.isBuildJob(outs):
                runJob.moveTrfMetadata(job.workdir, job.jobId)

            # create the metadata for the output + log files
            ec, job, outputFileInfo = runJob.createFileMetadata(list(outs), job, outsDict, dsname, datasetDict, jobSite.sitename, analysisJob=analysisJob)
            if ec:
                runJob.failJob(0, ec, job, pilotErrorDiag=job.pilotErrorDiag)

        # move output files from workdir to local DDM area
        finalUpdateDone = False
        if outs:
            tolog("Setting stage-out state until all output files have been copied")
            job.setState(["stageout", 0, 0])
            rt = RunJobUtilities.updatePilotServer(job, runJob.getPilotServer(), runJob.getPilotPort())

            # stage-out output files
            ec, job, rf, latereg = runJob.stageOut(job, jobSite, outs, analysisJob, dsname, datasetDict, outputFileInfo)

            # error handling
            if job.result[0] == "finished" or ec == error.ERR_PUTFUNCNOCALL:
                rt = RunJobUtilities.updatePilotServer(job, runJob.getPilotServer(), runJob.getPilotPort(), final=True)
            else:
                rt = RunJobUtilities.updatePilotServer(job, runJob.getPilotServer(), runJob.getPilotPort(), final=True, latereg=latereg)
            if ec == error.ERR_NOSTORAGE:
                # update the current file states for all files since nothing could be transferred
                updateFileStates(outs, runJob.getParentWorkDir(), job.jobId, mode="file_state", state="not_transferred")
                dumpFileStates(runJob.getParentWorkDir(), job.jobId)

            finalUpdateDone = True
            if ec != 0:
                runJob.sysExit(job, rf)

        # (stage-out ends here) .......................................................................

        job.setState(["finished", 0, 0])
        if not finalUpdateDone:
            rt = RunJobUtilities.updatePilotServer(job, runJob.getPilotServer(), runJob.getPilotPort(), final=True)
        runJob.sysExit(job)

    except Exception, errorMsg:

        error = PilotErrors()

        if runJob.getGlobalPilotErrorDiag() != "":
            pilotErrorDiag = "Exception caught in runJobArgo: %s" % (runJob.getGlobalPilotErrorDiag())
        else:
            pilotErrorDiag = "Exception caught in runJobArgo: %s" % str(errorMsg)

        if 'format_exc' in traceback.__all__:
            pilotErrorDiag += ", " + traceback.format_exc()

        try:
            tolog("!!FAILED!!3001!! %s" % (pilotErrorDiag))
        except Exception, e:
            if len(pilotErrorDiag) > 10000:
                pilotErrorDiag = pilotErrorDiag[:10000]
                tolog("!!FAILED!!3001!! Truncated (%s): %s" % (e, pilotErrorDiag))
            else:
                pilotErrorDiag = "Exception caught in runJob: %s" % (e)
                tolog("!!FAILED!!3001!! %s" % (pilotErrorDiag))

        # restore the proxy if necessary
        #if hP_ret:
        #    rP_ret = proxyguard.restoreProxy()
        #    if not rP_ret:
        #        tolog("Warning: Problems with storage can occur since proxy could not be restored")
        #    else:
        #        hP_ret = False
        #        tolog("ProxyGuard has finished successfully")

        tolog("sys.path=%s" % str(sys.path))
        cmd = "pwd;ls -lF %s;ls -lF;ls -lF .." % (runJob.getPilotInitDir())
        tolog("Executing command: %s" % (cmd))
        out = commands.getoutput(cmd)
        tolog("%s" % (out))

        job = Job.Job()
        job.setJobDef(newJobDef.job)
        job.pilotErrorDiag = pilotErrorDiag
        job.result[0] = "failed"
        if runJob.getGlobalErrorCode() != 0:
            job.result[2] = runJob.getGlobalErrorCode()
        else:
            job.result[2] = error.ERR_RUNJOBEXC
        tolog("Failing job with error code: %d" % (job.result[2]))
        # fail the job without calling sysExit/cleanup (will be called anyway)
        runJob.failJob(0, job.result[2], job, pilotErrorDiag=pilotErrorDiag, docleanup=False)