Source code for radical.pilot.raptor.worker


import io
import os
import sys
import time
import shlex

import threading         as mt

import radical.utils     as ru

from .. import states    as rps
from .. import constants as rpc

from ..pytask           import PythonTask
from ..task_description import TASK_FUNC, TASK_METH, TASK_EXEC
from ..task_description import TASK_PROC, TASK_SHELL, TASK_EVAL


# ------------------------------------------------------------------------------
#
[docs]class Worker(object):
    '''
    Implement the Raptor protocol for dispatching multiple Tasks on persistent
    resources.
    '''

    # --------------------------------------------------------------------------
    #
    def __init__(self, manager, rank, raptor_id):

        self._manager   = manager
        self._rank      = rank
        self._raptor_id = raptor_id
        self._reg_event = mt.Event()
        self._reg_addr  = os.environ['RP_REGISTRY_ADDRESS']
        self._sbox      = os.environ['RP_TASK_SANDBOX']
        self._uid       = os.environ['RP_TASK_ID']
        self._sid       = os.environ['RP_SESSION_ID']
        self._ranks     = int(os.environ['RP_RANKS'])

        self._reg       = ru.zmq.RegistryClient(url=self._reg_addr)
        self._cfg       = ru.Config(cfg=self._reg['cfg'])

        self._hb_delay  = self._reg['rcfg.raptor.hb_delay']

        self._log  = ru.Logger(name=self._uid,
                               ns='radical.pilot.worker',
                               level=self._cfg.log_lvl,
                               debug=self._cfg.debug_lvl,
                               targets=self._cfg.log_tgt,
                               path=self._cfg.path)
        self._prof = ru.Profiler(name='%s.%04d' % (self._uid, self._rank),
                                 ns='radical.pilot.worker',
                                 path=self._sbox)

        # register for lifetime management messages on the control pubsub
        psbox     = os.environ['RP_PILOT_SANDBOX']
        state_cfg = self._reg['bridges.%s' % rpc.STATE_PUBSUB]
        ctrl_cfg  = self._reg['bridges.%s' % rpc.CONTROL_PUBSUB]

        ru.zmq.Subscriber(rpc.STATE_PUBSUB, url=state_cfg['addr_sub'],
                          log=self._log, prof=self._prof, cb=self._state_cb,
                          topic=rpc.STATE_PUBSUB)
        ru.zmq.Subscriber(rpc.CONTROL_PUBSUB, url=ctrl_cfg['addr_sub'],
                          log=self._log, prof=self._prof, cb=self._control_cb,
                          topic=rpc.CONTROL_PUBSUB)

        # we push hertbeat and registration messages on that pubsub also
        self._ctrl_pub = ru.zmq.Publisher(rpc.CONTROL_PUBSUB,
                                          url=ctrl_cfg['addr_pub'],
                                          log=self._log,
                                          prof=self._prof)
        # let ZMQ settle
        time.sleep(1)

        self._hb_register_count = 60
        # run heartbeat thread in all ranks (one hb msg every `n` seconds)
        self._log.debug('hb delay: %s', self._hb_delay)
        self._hb_thread = mt.Thread(target=self._hb_worker)
        self._hb_thread.daemon = True
        self._hb_thread.start()

        # run worker initialization *before* starting to work on requests.
        # the worker provides these builtin methods:
        #     eval:  evaluate a piece of python code with `eval`
        #     exec:  evaluate a piece of python code with `exec`
        #     call:  execute  a method or function call
        #     proc:  execute  a command line (fork/exec)
        #     shell: execute  a shell command
        self._modes = dict()
        self.register_mode(TASK_FUNC,  self._dispatch_func)
        self.register_mode(TASK_METH,  self._dispatch_meth)
        self.register_mode(TASK_EVAL,  self._dispatch_eval)
        self.register_mode(TASK_EXEC,  self._dispatch_exec)
        self.register_mode(TASK_PROC,  self._dispatch_proc)
        self.register_mode(TASK_SHELL, self._dispatch_shell)

        # prepare base env dict used for all tasks
        # NOTE: raptor tasks run in the same environment as the raptor worker
        self._task_env = dict()
        for k,v in os.environ.items():
            if not k.startswith('RP_'):
                self._task_env[k] = v

        reg_msg = {'cmd': 'worker_register',
                   'arg': {'uid'        : self._uid,
                           'raptor_id'  : self._raptor_id,
                           'ranks'      : self._ranks}}

        # the manager (rank 0) registers the worker with the master
        if self._manager:

            self._log.debug('register: %s / %s', self._uid, self._raptor_id)
            self._ctrl_pub.put(rpc.CONTROL_PUBSUB, reg_msg)

          # # FIXME: we never unregister on termination
          # self._ctrl_pub.put(rpc.CONTROL_PUBSUB, {'cmd': 'worker_unregister',
          #                                         'arg': {'uid' : self._uid}})

        # wait for raptor response (*all* ranks*)
        self._log.debug('wait for registration to complete')
        count = 0
        while not self._reg_event.wait(timeout=5):
            if count < self._hb_register_count:
                count += 1
                if self._manager:
                    self._log.debug('re-register: %s / %s', self._uid, self._raptor_id)
                    self._ctrl_pub.put(rpc.CONTROL_PUBSUB, reg_msg)
            else:
                self.stop()
                self.join()
                self._log.error('registration with master timed out')
                raise RuntimeError('registration with master timed out')

        if self._manager:
            self._log.debug('registration with master ok')


    # --------------------------------------------------------------------------
    #
    def _hb_worker(self):

        while True:

            self._ctrl_pub.put(rpc.CONTROL_PUBSUB,
                    {'cmd': 'worker_rank_heartbeat',
                     'arg': {'uid' : self._uid,
                             'rank': self._rank}})

            time.sleep(self._hb_delay)


    # --------------------------------------------------------------------------
    #
    def _state_cb(self, topic, msgs):

        for msg in ru.as_list(msgs):

            cmd = msg['cmd']
            arg = msg['arg']

            if cmd != 'update':
                continue

            for thing in arg:

                uid   = thing['uid']
                state = thing['state']

                if uid == self._raptor_id:

                    if state in rps.FINAL + [rps.AGENT_STAGING_OUTPUT_PENDING]:
                        # master completed - terminate this worker
                        self._log.info('master %s final: %s - terminate',
                                       uid, state)
                        self.stop()
                        return False

        return True


    # --------------------------------------------------------------------------
    #
    def _control_cb(self, topic, msg):

        cmd = msg.get('cmd')
        arg = msg.get('arg')

        if cmd == 'worker_registered':

            if arg['uid'] != self._uid:
                return

            if self._reg_event.is_set():
                # registration was completed already
                return

            self._ts_addr      = arg['info']['ts_addr']
            self._res_addr_put = arg['info']['res_addr_put']
            self._req_addr_get = arg['info']['req_addr_get']

            self._reg_event.set()

        elif cmd == 'terminate':
            self.stop()
            self.join()
            sys.exit()

        elif cmd == 'worker_terminate':

            if arg['uid'] == self._uid:
                self._log.debug('worker_terminate signal')
                self.stop()
                self.join()
                sys.exit()


    # --------------------------------------------------------------------------
    #
[docs]    def get_master(self):
        '''
        The worker can submit tasks back to the master - this method will
        return a small shim class to provide that capability.  That class has
        a single method `run_task` which accepts a single `rp.TaskDescription`
        from which a `rp.Task` is created and executed.  The call then waits for
        the task's completion before returning it in a dict representation, the
        same as when passed to the master's `result_cb`.

        Note: the `run_task` call is running in a separate thread and will thus
              not block the master's progress.

        Returns:
            Master: a shim class with only one method: `run_task(td)` where
                `td` is a `TaskDescription` to run.
        '''

        # ----------------------------------------------------------------------
        class Master(object):

            def __init__(self, addr):
                self._task_service_ep = ru.zmq.Client(url=addr)

            def run_task(self, td):
                return self._task_service_ep.request('run_task', td)
        # ----------------------------------------------------------------------

        return Master(self._ts_addr)


    # --------------------------------------------------------------------------
    #
[docs]    def start(self):
        '''Start the workers main work loop.
        '''

        raise NotImplementedError('`start()` must be implemented by child class')


    # --------------------------------------------------------------------------
    #
[docs]    def stop(self):
        '''Signal the workers to stop the main work loop.
        '''

        raise NotImplementedError('`stop()` must be implemented by child class')


    # --------------------------------------------------------------------------
    #
[docs]    def join(self):
        '''Wait until the worker's main work loop completed.
        '''

        raise NotImplementedError('`join()` must be implemented by child class')


    # --------------------------------------------------------------------------
    #
[docs]    def register_mode(self, name, dispatcher) -> None:
        '''
        Register a new task execution mode that this worker can handle.
        The specified dispatcher callable should accept a single argument: the
        task to execute.

        Args:
            name (str): name of the mode to register
            dispatcher (callable): function which implements the execution mode
        '''

        if name in self._modes:
            raise ValueError('mode %s already registered' % name)

        self._modes[name] = dispatcher


    # --------------------------------------------------------------------------
    #
[docs]    def get_dispatcher(self, name):
        '''Query a registered execution mode.

        Args:
            name (str): name of execution mode to query for

        Returns:
            Callable: the dispatcher method for that execution mode
        '''

        if name not in self._modes:
            raise ValueError('mode %s unknown' % name)

        return self._modes[name]


    # --------------------------------------------------------------------------
    #
    def _dispatch_meth(self, task):
        '''
        _dispatch_meth is a simple wrapper around _dispatch_func which points to
        private methods to be called.
        '''

        task['description']['function'] = task['description']['method']

        return self._dispatch_func(task)


    # --------------------------------------------------------------------------
    #
    def _dispatch_func(self, task):
        '''
        We expect three attributes: 'function', containing the name of the
        member method or free function to call, `args`, an optional list of
        unnamed parameters, and `kwargs`, and optional dictionary of named
        parameters.

        *function* is resolved first against `locals()`, then `globals()`, then
        attributes of the implementation class (member functions of *base*, as
        provided to `MPIWorkerRank()`). Finally, an attempt is made to
        deserialize a PythonTask from *function*. The first non-null resolution
        of *function* is used as the callable.

        NOTE: MPI function tasks will get a private communicator passed as first
              unnamed argument.

        Args:
            task (Dict[str, Any]): dictionary representation of the task to
                execute

        Returns:
            Tuple[str, str, int, Any, Tuple[str, str]]:
                - standard output (str)
                - standard error (str)
                - exit code (int)
                - return value (Any)
                - exception (Tuple[type (str), message (str)])

        Raises:
            KeyError
                if the task dictionary misses required entries
            ValueError
                if `task['description']['function']` cannot be resolved
            Assert
                if `task['description']['function']` is not set
        '''

        uid  = task['uid']
        func = task['description']['function']
        assert func

        args    = task['description'].get('args',   [])
        kwargs  = task['description'].get('kwargs', {})
        py_func = False

        self._log.debug('orig args: %s : %s', args, kwargs)

        # check if `func_name` is a global name
        names   = dict(list(globals().items()) + list(locals().items()))
        to_call = names.get(func)

        # if not, check if this is a class method of this worker implementation
        if not to_call:
            to_call = getattr(self, func, None)

        # check if we have a serialized object
        if not to_call:
            self._log.debug('func serialized: %d: %s', len(func), func)

            try:
                to_call, _args, _kwargs = PythonTask.get_func_attr(func)

            except Exception:
                self._log.warn('function is not a PythonTask [%s] ', uid)

            else:
                py_func = True
                if args or kwargs:
                    raise ValueError('`args` and `kwargs` must be empty for'
                                     'PythonTask function [%s]' % uid)
                else:
                    args   = _args
                    kwargs = _kwargs

        if not to_call:
            self._log.error('no %s in \n%s\n\n%s', func, names, dir(self))
            raise ValueError('%s callable %s not found: %s' % (uid, func, task))

        comm = task.get('mpi_comm')
        if comm:
            # we have an MPI communicator we need to inject into the function's
            # arguments.
            if py_func:
                # For a `py_func` we add the communicator as `comm` kwarg if
                # that is set to None, and otherwise as first `arg` if that is
                # None.  If neither is true we'll error out.
                # NOTE that we don't change the number of arguments either way.
                if 'comm' in kwargs and kwargs['comm'] is None:
                    kwargs['comm'] = comm
                elif args and args[0] is None:
                    args[0] = comm
                else:
                    raise RuntimeError("can't inject communicator for %s: %s: %s",
                                       task['uid'], args, kwargs)
            else:
                args.insert(0, comm)

        # make sure we capture stdout / stderr
        bak_stdout = sys.stdout
        bak_stderr = sys.stderr

        strout = None
        strerr = None

        # set the task environment
        old_env = os.environ.copy()

        for k, v in task['description'].get('environment', {}).items():
            os.environ[k] = str(v)

        try:
            # redirect stdio to capture them during execution
            sys.stdout = strout = io.StringIO()
            sys.stderr = strerr = io.StringIO()

            self._prof.prof('rank_start', uid=uid)
            self._log.debug('to call %s: %s : %s', to_call, args, kwargs)
            val = to_call(*args, **kwargs)
            self._prof.prof('rank_stop', uid=uid)
            out = strout.getvalue()
            err = strerr.getvalue()
            exc = (None, None)
            ret = 0

        except Exception as e:
            self._log.exception('_call failed: %s', task['uid'])
            val = None
            out = strout.getvalue()
            err = strerr.getvalue() + ('\ncall failed: %s' % e)
            exc = (repr(e), '\n'.join(ru.get_exception_trace()))
            ret = 1

        finally:
            # restore stdio
            sys.stdout = bak_stdout
            sys.stderr = bak_stderr

            # remove communicator from args again
            if comm:
                if py_func:
                    if 'comm' in kwargs:
                        del kwargs['comm']
                    elif args:
                        args[0] = None
                else:
                    args.pop(0)

            os.environ = old_env

        self._log.debug('%s: got %s', uid, out)

        return out, err, ret, val, exc


    # --------------------------------------------------------------------------
    #
    def _dispatch_eval(self, task):
        '''
        We expect a single attribute: 'code', containing the Python
        code to be eval'ed
        '''

        uid  = task['uid']
        code = task['description']['code']
        assert code

        bak_stdout = sys.stdout
        bak_stderr = sys.stderr

        strout = None
        strerr = None

        old_env = os.environ.copy()

        for k, v in task['description'].get('environment', {}).items():
            os.environ[k] = str(v)

        try:
            # redirect stdio to capture them during execution
            sys.stdout = strout = io.StringIO()
            sys.stderr = strerr = io.StringIO()

            self._log.debug('eval [%s] [%s]', code, task['uid'])

            self._prof.prof('rank_start', uid=uid)
            val = eval(code)
            self._prof.prof('rank_stop', uid=uid)
            out = strout.getvalue()
            err = strerr.getvalue()
            exc = (None, None)
            ret = 0

        except Exception as e:
            self._log.exception('_eval failed: %s', task['uid'])
            val = None
            out = strout.getvalue()
            err = strerr.getvalue() + ('\neval failed: %s' % e)
            exc = (repr(e), '\n'.join(ru.get_exception_trace()))
            ret = 1

        finally:
            # restore stdio
            sys.stdout = bak_stdout
            sys.stderr = bak_stderr

            os.environ = old_env

        return out, err, ret, val, exc



    # --------------------------------------------------------------------------
    #
    def _dispatch_exec(self, task):
        '''
        We expect a single attribute: 'code', containing the Python code to be
        exec'ed.  The optional attribute `pre_exec` can be used for any import
        statements and the like which need to run before the executed code.
        '''

        bak_stdout = sys.stdout
        bak_stderr = sys.stderr

        strout = None
        strerr = None

        old_env = os.environ.copy()

        for k, v in task['description'].get('environment', {}).items():
            os.environ[k] = str(v)

        try:
            # redirect stdio to capture them during execution
            sys.stdout = strout = io.StringIO()
            sys.stderr = strerr = io.StringIO()

            uid  = task['uid']
            pre  = task['description'].get('pre_exec', [])
            code = task['description']['code']

            # create a wrapper function around the given code
            lines = code.split('\n')
            outer = 'def _my_exec():\n'
            for line in lines:
                outer += '    ' + line + '\n'

            # call that wrapper function via exec, and keep the return value
            src = '%s\n\n%s\n\nresult=_my_exec()' % ('\n'.join(pre), outer)

            # assign a local variable to capture the code's return value.
            loc = dict()
            self._prof.prof('rank_start', uid=uid)
            exec(src, {}, loc)                # pylint: disable=exec-used # noqa
            self._prof.prof('rank_stop', uid=uid)
            val = loc['result']
            out = strout.getvalue()
            err = strerr.getvalue()
            exc = (None, None)
            ret = 0

        except Exception as e:
            self._log.exception('_exec failed: %s', task['uid'])
            val = None
            out = strout.getvalue()
            err = strerr.getvalue() + ('\nexec failed: %s' % e)
            exc = (repr(e), '\n'.join(ru.get_exception_trace()))
            ret = 1

        finally:
            # restore stdio
            sys.stdout = bak_stdout
            sys.stderr = bak_stderr

            os.environ = old_env

        return out, err, ret, val, exc


    # --------------------------------------------------------------------------
    #
    def _dispatch_proc(self, task):
        '''
        We expect two attributes: 'executable', containing the executabele to
        run, and `arguments` containing a list of arguments (strings) to pass as
        command line arguments.  We use `sp.Popen` to run the fork/exec, and to
        collect stdout, stderr and return code
        '''

        try:
            import subprocess as sp

            uid  = task['uid']
            exe  = task['description']['executable']
            args = task['description'].get('arguments', list())
            env  = dict(self._task_env)
            env.update(task['description']['environment'])

            cmd  = '%s %s' % (exe, ' '.join([shlex.quote(arg) for arg in args]))
            self._prof.prof('rank_start', uid=uid)
            proc = sp.Popen(cmd, env=env,  stdin=None,
                            stdout=sp.PIPE, stderr=sp.PIPE,
                            close_fds=True, shell=True)
            out, err = proc.communicate()
            ret      = proc.returncode
            exc      = (None, None)
            self._prof.prof('rank_stop', uid=uid)

        except Exception as e:
            self._log.exception('proc failed: %s', task['uid'])
            out = None
            err = 'exec failed: %s' % e
            exc = (repr(e), '\n'.join(ru.get_exception_trace()))
            ret = 1

        return out, err, ret, None, exc


    # --------------------------------------------------------------------------
    #
    def _dispatch_shell(self, task):
        '''
        We expect a single attribute: 'command', containing the command
        line to be called as string.
        '''

        try:
            uid = task['uid']
            cmd = task['description']['command']
            env = dict(self._task_env)
            env.update(task['description']['environment'])

          # self._log.debug('shell: --%s--', cmd)

            self._prof.prof('rank_start', uid=uid)
            out, err, ret = ru.sh_callout(cmd, shell=True, env=env)
            exc = (None, None)
            self._prof.prof('rank_stop', uid=uid)

        except Exception as e:
            self._log.exception('_shell failed: %s', task['uid'])
            out = None
            err = 'shell failed: %s' % e
            exc = (repr(e), '\n'.join(ru.get_exception_trace()))
            ret = 1

      # os.environ = old_env

        return out, err, ret, None, exc


    # --------------------------------------------------------------------------
    #
    def hello(self, msg, sleep=0):

        print('hello %s: %.3f' % (msg, time.time()))
        time.sleep(sleep)
        print('hello %s: %.3f' % (msg, time.time()))
        return 'hello %s' % msg


# ------------------------------------------------------------------------------