# Copyright (c) 2015 Arista Networks, Inc.  All rights reserved.
# Arista Networks, Inc. Confidential and Proprietary.

import functools
import json
import socket
import urllib.error
import urllib.parse
import urllib.request

import Tac
import Tracing

# Module-wide trace handle.  t0..t5 are shorthands for its trace0..trace5
# methods and are what the rest of this file calls to emit trace messages
# (t0 is used on error paths, t2 on method entry, t3-t5 for cache and data
# details).
traceHandle = Tracing.defaultTraceHandle()
t0 = traceHandle.trace0
t1 = traceHandle.trace1
t2 = traceHandle.trace2
t3 = traceHandle.trace3
t4 = traceHandle.trace4
t5 = traceHandle.trace5

# Timeout handed to urllib.request.urlopen() by RestApiClient.fetch(); urlopen
# interprets its timeout argument in seconds.
HTTP_FETCH_TIMEOUT = 60
# Number of decimal digits kept when progress values are rounded.
TASK_PROGRESS_PRECISION = 4

def removeUnuseful( collection, usefulKeys ):
   """Delete from `collection`, in place, every key not in `usefulKeys`.

   Args:
     - collection: A mutable mapping; it is modified, nothing is returned.
     - usefulKeys: An iterable of the keys that must be kept.
   """
   unwanted = set( collection ) - set( usefulKeys )
   for staleKey in unwanted:
      del collection[ staleKey ]

@functools.total_ordering
class JobId:
   """Job identifier made of the pair ( jtid, jobid ).

   Instances are hashable and are compared, ordered and hashed by that pair.
   The remaining ordering operators are derived by functools.total_ordering
   from __eq__ and __lt__.
   """
   __slots__ = ( 'jtid', 'jobid' )

   def __init__( self, jtid, jobid ):
      super().__init__()
      self.jtid = jtid
      self.jobid = jobid

   def __eq__( self, other ):
      # Follow the rich-comparison protocol: a non-JobId operand is simply
      # "not equal" rather than an assertion failure, so that expressions
      # such as `jobId == None` or `jobId in mixedList` are safe.
      if not isinstance( other, JobId ):
         return NotImplemented
      return ( self.jtid, self.jobid ) == ( other.jtid, other.jobid )

   def __lt__( self, other ):
      # Ordering against a non-JobId makes Python raise TypeError.
      if not isinstance( other, JobId ):
         return NotImplemented
      return ( self.jtid, self.jobid ) < ( other.jtid, other.jobid )

   def __hash__( self ):
      return hash( ( self.jtid, self.jobid ) )

   def __repr__( self ):
      return '{}(jtid={!r},jobid={!r})'.format( self.__class__.__name__,
                                                self.jtid, self.jobid )

class TaskId:
   """Task identifier: the owning JobId, a task id and an `ismap` flag.

   The constructor stores its arguments unchanged; `jobid` must be a JobId
   instance (checked with an assertion).
   """
   __slots__ = ( 'jobid', 'taskid', 'ismap' )

   def __init__( self, jobid, taskid, ismap ):
      super().__init__()
      # Catch callers that pass a raw id instead of a JobId object.
      assert isinstance( jobid, JobId ), f"Invalid Job ID: {jobid!r}"
      self.jobid, self.taskid, self.ismap = jobid, taskid, ismap

class YarnStatusBase:
   """Common base of the status records defined in this file.

   Supplies a __repr__ of the form `ClassName(attr=value,...)` that lists the
   attributes named in the instance's class `__slots__`, sorted by name.
   """
   __slots__ = ()

   def __init__( self ):
      pass

   def __repr__( self ):
      rendered = []
      for attr in sorted( self.__slots__ ):
         rendered.append( '%s=%r' % ( attr, getattr( self, attr ) ) )
      return '{}({})'.format( self.__class__.__name__, ','.join( rendered ) )

class ClusterStatus( YarnStatusBase ):
   """Cluster-wide status record.

   Plain value holder: each constructor argument is stored unchanged in the
   attribute of the same name.
   """
   __slots__ = ( 'activeNodes', 'decommissionedNodes', 'mapTasksRunning',
                 'reduceTasksRunning', 'state', 'allocatedMB', 'totalMB' )

   def __init__( self, activeNodes, decommissionedNodes, mapTasksRunning,
                 reduceTasksRunning, state, allocatedMB, totalMB ):
      super().__init__()
      self.state = state
      # Node accounting.
      self.activeNodes, self.decommissionedNodes = ( activeNodes,
                                                     decommissionedNodes )
      # Running task counts.
      self.mapTasksRunning, self.reduceTasksRunning = ( mapTasksRunning,
                                                        reduceTasksRunning )
      # Memory accounting.
      self.allocatedMB, self.totalMB = allocatedMB, totalMB

class JobStatus( YarnStatusBase ):
   """Status record of a single job.

   Plain value holder: each constructor argument is stored unchanged in the
   attribute of the same name.
   """
   __slots__ = ( 'jobid', 'name', 'user', 'mapProgress', 'reduceProgress', 'state',
                 'startTime', 'diagnostics', 'queueName', 'url' )

   def __init__( self, jobid, name, user, mapProgress, reduceProgress, state,
                 startTime, diagnostics, queueName, url ):
      super().__init__()
      # Identity of the job.
      self.jobid, self.name, self.user = jobid, name, user
      # Where the job runs and where it can be tracked.
      self.queueName, self.url = queueName, url
      # Execution state.
      self.state, self.startTime, self.diagnostics = state, startTime, diagnostics
      self.mapProgress, self.reduceProgress = mapProgress, reduceProgress

class TaskStatus( YarnStatusBase ):
   """Status record of a single task.

   Plain value holder: each constructor argument is stored unchanged in the
   attribute of the same name.  `counters` must be a TaskCounters instance
   (checked with an assertion before anything is stored).
   """
   __slots__ = ( 'taskid', 'taskType', 'progress', 'state', 'start', 'end',
                 'attempts', 'counters' )

   def __init__( self, taskid, taskType, progress, state, start, end, attempts,
                 counters ):
      assert isinstance( counters, TaskCounters ), \
            f"Invalid counters: {counters!r}"
      super().__init__()
      # Identity of the task.
      self.taskid, self.taskType = taskid, taskType
      # Execution state and timing.
      self.progress, self.state = progress, state
      self.start, self.end = start, end
      # Per-task details.
      self.attempts, self.counters = attempts, counters

class TaskAttemptStatus( YarnStatusBase ):
   """Status record of a single task attempt.

   Each constructor argument is stored unchanged in the attribute of the same
   name.  There is no `status` argument: the `status` attribute is always set
   to the same value as `state`.
   """
   __slots__ = ( 'attemptid', 'taskType', 'progress', 'state', 'status', 'start',
                 'end', 'nodeHttpAddress', 'counters' )

   def __init__( self, attemptid, taskType, progress, state, start, end,
                 nodeHttpAddress, counters ):
      super().__init__()
      # Identity of the attempt.
      self.attemptid, self.taskType = attemptid, taskType
      self.progress = progress
      # `status` mirrors `state`.
      self.state = self.status = state
      # Timing.
      self.start, self.end = start, end
      self.nodeHttpAddress = nodeHttpAddress
      self.counters = counters

class TaskCounters( YarnStatusBase ):
   """Byte counters of a task or task attempt.

   Plain value holder: each constructor argument is stored unchanged in the
   attribute of the same name.
   """
   __slots__ = ( 'hdfsBytesRead', 'hdfsBytesWritten', 'reduceShuffleBytes' )

   def __init__( self, hdfsBytesRead, hdfsBytesWritten, reduceShuffleBytes ):
      super().__init__()
      # HDFS traffic.
      self.hdfsBytesRead, self.hdfsBytesWritten = hdfsBytesRead, hdfsBytesWritten
      # Shuffle traffic.
      self.reduceShuffleBytes = reduceShuffleBytes

class RestApiException( Exception ):
   """ Base class for all Rest Api exceptions.

   Catching this class catches every Rest* exception defined in this file.
   """

class RestInvalidUrlException( RestApiException ):
   """ Exception raised when Rest Api Url is invalid.

   RestApiClient.fetch raises it when opening the URL fails with a urllib
   URLError; the message carries the URL and the underlying error.
   """

class RestInvalidDataException( RestApiException ):
   """ Exception raised when Rest Api Data is invalid.

   RestApiClient.fetch raises it when the response cannot be decoded as JSON.
   """

class RestUrlTimeoutException( RestApiException ):
   """ Exception raised when Rest Api times out.

   RestApiClient.fetch raises it when a socket timeout is caught while
   fetching a URL.
   """

class RestIncompleteDataException( RestApiException ):
   """ Exception raised when Rest Api Data is incomplete.

   ensure() raises it when a condition checked on a response (typically the
   presence of required keys) does not hold.
   """

class RestApiClient:
   """REST API client to communicate with Yarn.

   Fetches JSON documents over HTTP from paths relative to a base URL.

   Decoded responses can be cached.  The cache and its statistics are class
   attributes of RestApiClient itself, so they are shared by every client,
   subclasses included.  Entries are keyed by client name and then by
   absolute URL: clients created with the same name share cached responses.
   Cached objects are returned as-is (not copied), so a caller that mutates a
   response also mutates the cached value.
   """
   cacheEnabled_ = True
   cache_ = {}
   cacheHits_ = 0
   cacheMisses_ = 0
   __slots__ = ( 'name_', 'baseUrl_' )

   def __init__( self, name, baseUrl ):
      """Create a client.

      Args:
        - name: Cache namespace of this client.
        - baseUrl: URL against which the paths given to fetch() are resolved.
      """
      t2( 'Initializing RestApiClient with name:', name, 'baseUrl:', baseUrl )
      self.name_ = name
      self.baseUrl_ = baseUrl

   @classmethod
   def enableCache( cls ):
      """Turn response caching on for all clients."""
      RestApiClient.cacheEnabled_ = True

   @classmethod
   def disableCache( cls ):
      """Turn response caching off for all clients (entries are kept)."""
      RestApiClient.cacheEnabled_ = False

   @classmethod
   def clearCache( cls, name ):
      """Drop the cached responses stored under `name`.

      Also traces the hit/miss statistics and resets them; the statistics are
      global, not per name.
      """
      t3( 'RestApiClient::clearCache for:', name )
      t3( 'RestApiClient Cache statistics: %d hits / %d misses / %d total' %
          ( RestApiClient.cacheHits_, RestApiClient.cacheMisses_,
            RestApiClient.cacheHits_ + RestApiClient.cacheMisses_ ) )
      if name in RestApiClient.cache_:
         del RestApiClient.cache_[ name ]
      RestApiClient.cacheHits_ = 0
      RestApiClient.cacheMisses_ = 0

   @staticmethod
   def _timeoutError( url ):
      """Trace a timeout on `url` and return the exception to raise for it."""
      t0( 'Timeout when fetching url:', url )
      return RestUrlTimeoutException( 'Timeout for Url: %s' % url )

   def fetch( self, path, useCache=False ):
      """Fetch `path` (resolved against the base URL) and decode it as JSON.

      Args:
        - path: Path or URL, joined to the base URL with urllib's urljoin.
        - useCache: If True and caching is enabled, return the cached
          response for this URL when there is one, and cache the response
          otherwise.
      Returns:
        The decoded JSON document.
      Raises:
        RestUrlTimeoutException: the request timed out, whether while
          connecting, waiting for the response or reading its body.
        RestInvalidUrlException: the URL could not be opened for any other
          reason reported by urllib as a URLError (HTTP errors included).
        RestInvalidDataException: the response is not valid JSON.
      """
      url = urllib.parse.urljoin( self.baseUrl_, path )
      if ( RestApiClient.cacheEnabled_ and useCache and
           self.name_ in RestApiClient.cache_ and
           url in RestApiClient.cache_[ self.name_ ] ):
         t4( 'Cache hit for url:', url )
         RestApiClient.cacheHits_ += 1
         return RestApiClient.cache_[ self.name_ ][ url ]

      try:
         # urlopen returns a file-like response object; its body is read and
         # decoded below and the object is closed in the `finally` clause.
         # pylint: disable-next=consider-using-with
         data = urllib.request.urlopen( url, timeout=HTTP_FETCH_TIMEOUT )
      except socket.timeout as e:
         raise self._timeoutError( url ) from e
      except urllib.error.URLError as exc:
         # urllib wraps errors raised while sending the request, connect
         # timeouts included, in URLError with the original error as
         # `reason`.  Report those as timeouts, not as invalid URLs.
         if isinstance( getattr( exc, 'reason', None ), socket.timeout ):
            raise self._timeoutError( url ) from exc
         t0( 'Error when fetching url:', url, 'error:', str( exc ) )
         raise RestInvalidUrlException( 'Invalid Url: %s, Error: %s' %
                                        ( url, str( exc ) ) ) from exc

      try:
         # json.load reads the whole body (bytes) from the response and
         # decodes it; the read itself can still hit the socket timeout.
         retValue = json.load( data, strict=False )
      except socket.timeout as e:
         raise self._timeoutError( url ) from e
      except ValueError as e:
         t0( 'Data fetched from url:', url, 'is not in json format' )
         raise RestInvalidDataException( 'Invalid data from url: %s' % url ) from e
      finally:
         data.close()

      t5( 'Data fetched from url:', url, 'is:', retValue )

      if RestApiClient.cacheEnabled_ and useCache:
         t4( 'Cache miss for url:', url )
         RestApiClient.cacheMisses_ += 1
         if self.name_ not in RestApiClient.cache_:
            RestApiClient.cache_[ self.name_ ] = {}
         RestApiClient.cache_[ self.name_ ][ url ] = retValue

      return retValue

def ensure( condition, message ):
   """Validate a condition while reading a response, like an assert.

   Args:
     - condition: Anything truth-testable.  Nothing happens when it is true.
     - message: The text of the exception, either a string or a callable
       returning one.  A callable is only invoked when the condition is
       false, so expensive messages cost nothing on the success path.
   Raises:
     RestIncompleteDataException: the condition is false.
   """
   if condition:
      return
   text = message() if callable( message ) else message
   t0( "Assertion failed", text )
   raise RestIncompleteDataException( text )

class ResourceManagerClient( RestApiClient ):
   """Client for the `ws/v1/cluster/...` REST endpoints of the Resource Manager.

   Every getter validates the parts of the response it returns with ensure(),
   so a response lacking them raises RestIncompleteDataException.
   """
   __slots__ = ( 'host_', 'port_' )

   def __init__( self, name, host, port ):
      """Create a client for the Resource Manager at http://<host>:<port>."""
      t2( 'Initializing ResourceManagerClient with name: %s, host: %s, port: %d' %
          ( name, host, port ) )
      super().__init__( name, 'http://%s:%d' % ( host, port ) )
      self.host_ = host
      self.port_ = port

   def getClusterInfo( self ):
      """Return the `clusterInfo` object, which is checked to hold `state`."""
      t2( 'ResourceManagerClient::getClusterInfo' )
      path = 'ws/v1/cluster/info'
      response = self.fetch( path )
      ensure( 'clusterInfo' in response,
              lambda: 'clusterInfo is missing, url: %s, data: %s' %
                 ( path, response ) )

      info = response[ 'clusterInfo' ]
      ensure( 'state' in info,
              lambda: 'state is missing, url: %s, data: %s' %
                 ( path, info ) )
      return info

   def getClusterMetrics( self ):
      """Return the `clusterMetrics` object after checking its required keys."""
      t2( 'ResourceManagerClient::getClusterMetrics' )
      path = 'ws/v1/cluster/metrics'
      response = self.fetch( path )
      ensure( 'clusterMetrics' in response,
              lambda: 'clusterMetrics is missing, url: %s, data: %s' %
                 ( path, response ) )

      metrics = response[ 'clusterMetrics' ]
      mandatory = ( 'decommissionedNodes', 'allocatedMB', 'totalMB' )
      ensure( set( mandatory ) <= set( metrics ),
              lambda: 'One or more keys among %s are missing, url: %s, data: %s' %
                 ( mandatory, path, metrics ) )
      return metrics

   def getClusterNodes( self ):
      """Return the list of nodes, or [] when the response lists none.

      Every returned node is checked to hold `state`.
      """
      t2( 'ResourceManagerClient::getClusterNodes' )
      path = 'ws/v1/cluster/nodes'
      response = self.fetch( path )
      hasNodes = ( response and 'nodes' in response and
                   response[ 'nodes' ] and 'node' in response[ 'nodes' ] )
      if not hasNodes:
         return []

      nodeList = response[ 'nodes' ][ 'node' ]
      for entry in nodeList:
         ensure( 'state' in entry,
                 lambda: 'state is missing, url: %s, data: %s' %
                    ( path, nodeList ) )
      return nodeList

   def getClusterApps( self, state=None ):
      """Return the MAPREDUCE applications, optionally filtered by state.

      Args:
        - state: If set, appended to the request as `?states=<state>`.
      Returns:
        A list of application objects reduced, in place, to the required
        keys; [] when the response lists no application.  The request goes
        through the cache.
      """
      t2( 'ResourceManagerClient::getClusterApps' )
      path = 'ws/v1/cluster/apps' + ( '?states=' + state if state else '' )

      response = self.fetch( path, useCache=True )
      hasApps = ( response and 'apps' in response and
                  response[ 'apps' ] and 'app' in response[ 'apps' ] )
      if not hasApps:
         return []

      # Only MapReduce applications are of interest.
      mapReduceApps = []
      for candidate in response[ 'apps' ][ 'app' ]:
         if candidate.get( 'applicationType' ) == 'MAPREDUCE':
            mapReduceApps.append( candidate )

      mandatory = ( 'state', 'trackingUrl', 'trackingUI', 'queue',
                    'applicationType' )
      for entry in mapReduceApps:
         ensure( set( mandatory ) <= set( entry ),
                 lambda: 'One or more keys among %s are missing, url: %s, '
                    'data: %s' % ( mandatory, path, response ) )
         # The response lives on in the cache: keep only what is needed.
         removeUnuseful( entry, mandatory )
      return mapReduceApps

   def getApplication( self, appId ):
      """Return application `appId`, or None if absent or not MAPREDUCE.

      The returned object is reduced, in place, to the required keys.  The
      request goes through the cache.
      """
      t2( 'ResourceManagerClient::getApplication' )
      path = 'ws/v1/cluster/apps/%s' % appId

      response = self.fetch( path, useCache=True )
      if not ( response and 'app' in response ):
         return None

      entry = response[ 'app' ]
      if entry.get( 'applicationType' ) != 'MAPREDUCE':
         return None

      mandatory = ( 'state', 'trackingUrl', 'trackingUI', 'queue',
                    'applicationType' )
      ensure( set( mandatory ) <= set( entry ),
              lambda: 'One or more keys among %s are missing, url: %s, data: %s' %
                 ( mandatory, path, response ) )
      # The response lives on in the cache: keep only what is needed.
      removeUnuseful( entry, mandatory )
      return entry

class ApplicationMasterClient( RestApiClient ):
   """REST API client to communicate with Application Master.

   Serves the `ws/v1/mapreduce/...` endpoints relative to the base URL it is
   created with.  Job, task and task attempt listings go through the cache
   and are reduced, in place, to the keys that are needed; counters are
   fetched without caching and returned unfiltered.
   """

   def __init__( self, name, baseUrl ):
      t2( 'Initializing ApplicationMasterClient with name:', name,
          'baseUrl:', baseUrl )
      super().__init__( name, baseUrl )

   def getAppJobs( self ):
      """Return the list of jobs, or [] when the response lists none.

      Raises:
        RestIncompleteDataException: a job lacks a required key.
      """
      t2( 'ApplicationMasterClient::getAppJobs' )
      jobsPath = 'ws/v1/mapreduce/jobs'

      data = self.fetch( jobsPath, useCache=True )
      if ( not data or 'jobs' not in data or
           not data[ 'jobs' ] or 'job' not in data[ 'jobs' ] ):
         return []

      jobs = data[ 'jobs' ][ 'job' ]
      requiredKeys = ( 'state', 'id', 'name', 'user', 'startTime', 'mapsRunning',
                       'mapProgress', 'reducesRunning', 'reduceProgress',
                       'diagnostics' )
      usefulKeys = requiredKeys
      for job in jobs:
         ensure( set( requiredKeys ).issubset( set( job ) ),
                 lambda: 'One or more keys among %s are missing, url: %s, '
                    'data: %s' % ( requiredKeys, jobsPath, jobs ) )
         # Remove unused keys from Job status to save cache memory.
         removeUnuseful( job, usefulKeys )

      return jobs

   def getJobTasks( self, jobId ):
      """Return the tasks of job `jobId`, or [] when the response lists none.

      Raises:
        RestIncompleteDataException: a task lacks a required key.
      """
      t2( 'ApplicationMasterClient::getJobTasks' )
      # Format the path once so that error messages show the path that was
      # actually fetched rather than the template.
      tasksPath = 'ws/v1/mapreduce/jobs/%s/tasks' % jobId

      data = self.fetch( tasksPath, useCache=True )
      if ( not data or 'tasks' not in data or
           not data[ 'tasks' ] or 'task' not in data[ 'tasks' ] ):
         return []

      tasks = data[ 'tasks' ][ 'task' ]
      requiredKeys = ( 'state', 'id' )
      usefulKeys = requiredKeys
      for task in tasks:
         ensure( set( requiredKeys ).issubset( set( task ) ),
                 lambda: 'One or more keys among %s are missing, url: %s, '
                    'data: %s' % ( requiredKeys, tasksPath, tasks ) )
         # Remove unused keys from Task status to save cache memory.
         removeUnuseful( task, usefulKeys )

      return tasks

   def getTaskAttempts( self, jobId, taskId ):
      """Return the attempts of a task, or [] when the response lists none.

      `nodeHttpAddress` is kept when present but is not required, so callers
      must not assume it is there.

      Raises:
        RestIncompleteDataException: an attempt lacks a required key.
      """
      t2( 'ApplicationMasterClient::getTaskAttempts' )
      # Format the path once so that error messages show the path that was
      # actually fetched rather than the template.
      taskAttemptsPath = ( 'ws/v1/mapreduce/jobs/%s/tasks/%s/attempts' %
                           ( jobId, taskId ) )

      data = self.fetch( taskAttemptsPath, useCache=True )
      if ( not data or 'taskAttempts' not in data or not data[ 'taskAttempts' ] or
           'taskAttempt' not in data[ 'taskAttempts' ] ):
         return []

      attempts = data[ 'taskAttempts' ][ 'taskAttempt' ]
      requiredKeys = ( 'state', 'id', 'type', 'progress', 'startTime', 'finishTime' )
      usefulKeys = ( 'state', 'id', 'type', 'progress', 'startTime', 'finishTime',
                     'nodeHttpAddress' )
      for attempt in attempts:
         ensure( set( requiredKeys ).issubset( set( attempt ) ),
                 lambda: 'One or more keys among %s are missing, url: %s, '
                    'data: %s' % ( requiredKeys, taskAttemptsPath, attempts ) )
         # Remove unused keys from Task Attempt status to save cache memory.
         removeUnuseful( attempt, usefulKeys )

      return attempts

   def getTaskCounters( self, jobId, taskId ):
      """Return the counter groups of a task, or [] when there are none."""
      t2( 'ApplicationMasterClient::getTaskCounters' )
      taskCountersPath = 'ws/v1/mapreduce/jobs/%s/tasks/%s/counters'

      taskCountersData = self.fetch( taskCountersPath % ( jobId, taskId ) )
      if ( not taskCountersData or 'jobTaskCounters' not in taskCountersData or
           not taskCountersData[ 'jobTaskCounters' ] or
           'taskCounterGroup' not in taskCountersData[ 'jobTaskCounters' ] ):
         return []

      return taskCountersData[ 'jobTaskCounters' ][ 'taskCounterGroup' ]

   def getTaskAttemptCounters( self, jobId, taskId, attemptId ):
      """Return the counter groups of a task attempt, or [] when none."""
      t2( 'ApplicationMasterClient::getTaskAttemptCounters' )
      attemptCountersPath = 'ws/v1/mapreduce/jobs/%s/tasks/%s/attempts/%s/counters'

      attemptCountersData = self.fetch( attemptCountersPath %
                                        ( jobId, taskId, attemptId ) )
      if ( not attemptCountersData or
           'jobTaskAttemptCounters' not in attemptCountersData or
           not attemptCountersData[ 'jobTaskAttemptCounters' ] or
           'taskAttemptCounterGroup' not in
              attemptCountersData[ 'jobTaskAttemptCounters' ] ):
         return []

      return ( attemptCountersData[ 'jobTaskAttemptCounters' ]
               [ 'taskAttemptCounterGroup' ] )

class JobHistoryClient( RestApiClient ):
   """REST API client to communicate with Job History Server.

   Serves the `ws/v1/history/mapreduce/...` endpoints.  Nothing is cached
   and responses are returned unfiltered; listings are checked with ensure()
   for the keys that are required.
   """
   __slots__ = ( 'host_', 'port_' )

   def __init__( self, name, host, port ):
      """Create a client for the History Server at http://<host>:<port>."""
      t2( 'Initializing JobHistoryClient with name: %s, host: %s, port: %d' %
          ( name, host, port ) )
      baseUrl = 'http://%s:%d' % ( host, port )
      super().__init__( name, baseUrl )
      self.host_ = host
      self.port_ = port

   def getJobTasks( self, jobId ):
      """Return the tasks of job `jobId`, or [] when the response lists none.

      Raises:
        RestIncompleteDataException: a task lacks a required key.
      """
      t2( 'JobHistoryClient::getJobTasks' )
      # Format the path once so that error messages show the path that was
      # actually fetched rather than the template.
      tasksPath = 'ws/v1/history/mapreduce/jobs/%s/tasks' % jobId

      data = self.fetch( tasksPath )
      if ( not data or 'tasks' not in data or
           not data[ 'tasks' ] or 'task' not in data[ 'tasks' ] ):
         return []

      tasks = data[ 'tasks' ][ 'task' ]
      requiredKeys = ( 'state', 'id' )
      for task in tasks:
         ensure( set( requiredKeys ).issubset( set( task ) ),
                 lambda: 'One or more keys among %s are missing, url: %s, '
                    'data: %s' % ( requiredKeys, tasksPath, tasks ) )

      return tasks

   def getTaskAttempts( self, jobId, taskId ):
      """Return the attempts of a task, or [] when the response lists none.

      Raises:
        RestIncompleteDataException: an attempt lacks a required key.
      """
      t2( 'JobHistoryClient::getTaskAttempts' )
      # Format the path once so that error messages show the path that was
      # actually fetched rather than the template.
      taskAttemptsPath = ( 'ws/v1/history/mapreduce/jobs/%s/tasks/%s/attempts' %
                           ( jobId, taskId ) )

      data = self.fetch( taskAttemptsPath )
      if ( not data or 'taskAttempts' not in data or not data[ 'taskAttempts' ] or
           'taskAttempt' not in data[ 'taskAttempts' ] ):
         return []

      attempts = data[ 'taskAttempts' ][ 'taskAttempt' ]
      requiredKeys = ( 'state', 'id', 'type', 'progress', 'startTime', 'finishTime' )
      for attempt in attempts:
         ensure( set( requiredKeys ).issubset( set( attempt ) ),
                 lambda: 'One or more keys among %s are missing, url: %s, '
                    'data: %s' % ( requiredKeys, taskAttemptsPath, attempts ) )

      return attempts

   def getTaskCounters( self, jobId, taskId ):
      """Return the counter groups of a task, or [] when there are none."""
      t2( 'JobHistoryClient::getTaskCounters' )
      taskCountersPath = 'ws/v1/history/mapreduce/jobs/%s/tasks/%s/counters'

      taskCountersData = self.fetch( taskCountersPath % ( jobId, taskId ) )
      if ( not taskCountersData or 'jobTaskCounters' not in taskCountersData or
           not taskCountersData[ 'jobTaskCounters' ] or
           'taskCounterGroup' not in taskCountersData[ 'jobTaskCounters' ] ):
         return []

      return taskCountersData[ 'jobTaskCounters' ][ 'taskCounterGroup' ]

   def getTaskAttemptCounters( self, jobId, taskId, attemptId ):
      """Return the counter groups of a task attempt, or [] when none."""
      t2( 'JobHistoryClient::getTaskAttemptCounters' )
      attemptCountersPath = \
            'ws/v1/history/mapreduce/jobs/%s/tasks/%s/attempts/%s/counters'

      attemptCountersData = self.fetch( attemptCountersPath %
                                        ( jobId, taskId, attemptId ) )
      if ( not attemptCountersData or
           'jobTaskAttemptCounters' not in attemptCountersData or
           not attemptCountersData[ 'jobTaskAttemptCounters' ] or
           'taskAttemptCounterGroup' not in
               attemptCountersData[ 'jobTaskAttemptCounters' ] ):
         return []

      return ( attemptCountersData[ 'jobTaskAttemptCounters' ]
               [ 'taskAttemptCounterGroup' ] )

class YarnClient:
   """ Client abstraction that provides APIs to obtain information from Yarn.

   Owns a ResourceManagerClient and a JobHistoryClient, both created in
   __init__, and creates ApplicationMasterClient instances on demand from
   the trackingUrl of each application.  All of them share this client's
   name, and therefore the same cache namespace.
   """

   # The tables below translate the state names found in REST responses
   # (keys) into the state names used in the status objects returned by this
   # class (values).  A state missing from a table raises KeyError.
   CLUSTER_STATES = {
      "NOTINITED" : "initializing",
      "INITED" : "initializing",
      "STARTED" : "running",
      "STOPPED" : "initializing",
   }

   JOB_STATES = {
      "NEW" : "prep",
      "SCHEDULED" : "prep",
      "RUNNING" : "running",
      "SUCCEEDED" : "succeeded",
      "FAILED" : "failed",
      "KILL_WAIT" : "killed",
      "KILLED" : "killed",
   }

   TASK_STATES = {
      "NEW" : "pending",
      "SCHEDULED" : "pending",
      "RUNNING" : "running",
      "SUCCEEDED" : "complete",
      "FAILED" : "failed",
      "KILL_WAIT" : "killed",
      "KILLED" : "killed",
   }

   TASK_ATTEMPT_STATES = {
      "NEW" : "pending",
      "UNASSIGNED" : "pending",
      "ASSIGNED" : "pending",
      "STARTING" : "pending",
      "RUNNING" : "running",
      "COMMIT_PENDING" : "running",
      "SUCCESS_CONTAINER_CLEANUP" : "complete",
      "SUCCEEDED" : "complete",
      "FAIL_CONTAINER_CLEANUP" : "failed",
      "FAIL_TASK_CLEANUP" : "failed",
      "FAILED" : "failed",
      "KILL_CONTAINER_CLEANUP" : "killed",
      "KILL_TASK_CLEANUP" : "killed",
      "KILLED" : "killed",
   }

   __slots__ = ( 'name_', 'rmHost_', 'rmPort_', 'jhHost_', 'jhPort_', 'rm', 'jh' )

   def __init__( self, name, rmHost, rmPort, jhHost, jhPort ):
      """Create the Resource Manager and Job History clients.

      Args:
        - name: Name given to every underlying client (cache namespace).
        - rmHost, rmPort: Address of the Resource Manager.
        - jhHost, jhPort: Address of the Job History Server.
      """
      t2( 'Initializing YarnClient with name: %s, Resource Manager %s:%d, '
          'History Server %s:%d' % ( name, rmHost, rmPort, jhHost, jhPort ) )
      self.name_ = name
      self.rmHost_ = rmHost
      self.rmPort_ = rmPort
      self.jhHost_ = jhHost
      self.jhPort_ = jhPort
      self.rm = ResourceManagerClient( name=name, host=rmHost, port=rmPort )
      self.jh = JobHistoryClient( name=name, host=jhHost, port=jhPort )

   def enableCache( self ):
      """Turn response caching on (for all RestApiClient instances)."""
      RestApiClient.enableCache()

   def disableCache( self ):
      """Turn response caching off (for all RestApiClient instances)."""
      RestApiClient.disableCache()

   def clearCache( self ):
      """Drop the responses cached under this client's name."""
      RestApiClient.clearCache( self.name_ )

   @staticmethod
   def _toFraction( percent ):
      """Divide `percent` by 100, rounded to TASK_PROGRESS_PRECISION digits."""
      return round( percent / 100, TASK_PROGRESS_PRECISION )

   @staticmethod
   def _taskIdOf( attemptId ):
      """Derive the task id from an attempt id.

      Drops the last '_'-separated field and replaces 'attempt' by 'task'.
      """
      return attemptId[ : attemptId.rfind( '_' ) ].replace( 'attempt', 'task' )

   @staticmethod
   def _attemptStatus( attempt ):
      """Build a TaskAttemptStatus (without counters) from a REST attempt."""
      state = YarnClient.TASK_ATTEMPT_STATES[ attempt[ 'state' ] ]
      return TaskAttemptStatus( attempt[ 'id' ], attempt[ 'type' ],
                                YarnClient._toFraction( attempt[ 'progress' ] ),
                                state,
                                attempt[ 'startTime' ],
                                attempt[ 'finishTime' ],
                                # Not a required key of attempts: may be absent.
                                attempt.get( 'nodeHttpAddress' ),
                                None )

   def _runningAppJobs( self ):
      """Yield ( app, am, job ) for each job of each RUNNING application.

      `am` is the ApplicationMasterClient of `app`.  Jobs are yielded whatever
      their own state is; callers filter them as needed.
      """
      for app in self.rm.getClusterApps( state='RUNNING' ):
         am = ApplicationMasterClient( self.name_, app[ 'trackingUrl' ] )
         for job in am.getAppJobs():
            yield app, am, job

   def _clientForJob( self, jobId ):
      """Return the client able to serve job `jobId`, or None.

      The application id is derived from the job id by replacing 'job' with
      'application'.  Depending on the application's trackingUI, the client is
      a new ApplicationMasterClient or the Job History client.  None is
      returned, after tracing, when the application is unknown or its
      trackingUI is not recognized.
      """
      appId = jobId.replace( 'job', 'application' )
      app = self.rm.getApplication( appId )
      if not app or 'trackingUI' not in app or 'trackingUrl' not in app:
         t0( 'No trackingUrl in app:', app )
         return None

      if app[ 'trackingUI' ] == 'ApplicationMaster':
         return ApplicationMasterClient( self.name_, app[ 'trackingUrl' ] )
      if app[ 'trackingUI' ] == 'History':
         return self.jh

      t0( 'Unknown trackingUI in app:', app )
      return None

   # Get Cluster Status.
   def getClusterStatus( self ):
      """Return a ClusterStatus built from the Resource Manager.

      Active nodes are the ids of the nodes whose state is RUNNING.  Running
      map and reduce task counts are summed over all jobs of all RUNNING
      applications.
      """
      t2( 'YarnClient::getClusterStatus' )

      clusterInfo = self.rm.getClusterInfo()
      state = YarnClient.CLUSTER_STATES[ clusterInfo[ 'state' ] ]

      clusterNodes = self.rm.getClusterNodes()
      activeNodes = [ node[ 'id' ] for node in clusterNodes
                      if node[ 'state' ] == 'RUNNING' ]

      clusterMetrics = self.rm.getClusterMetrics()
      decommissionedNodes = clusterMetrics[ 'decommissionedNodes' ]
      allocatedMB = clusterMetrics[ 'allocatedMB' ]
      totalMB = clusterMetrics[ 'totalMB' ]

      mapTasksRunning = 0
      reduceTasksRunning = 0
      for _, _, job in self._runningAppJobs():
         mapTasksRunning += job[ 'mapsRunning' ]
         reduceTasksRunning += job[ 'reducesRunning' ]

      return ClusterStatus( activeNodes, decommissionedNodes, mapTasksRunning,
                            reduceTasksRunning, state, allocatedMB, totalMB )

   # Get all running jobs in cluster.
   def getRunningJobs( self ):
      """Return a JobStatus for every RUNNING job of every RUNNING app."""
      t2( 'YarnClient::getRunningJobs' )
      runningJobs = []

      for app, _, job in self._runningAppJobs():
         if job[ 'state' ] != 'RUNNING':
            continue
         state = self.JOB_STATES[ job[ 'state' ] ]
         jobStatus = JobStatus( job[ 'id' ], job[ 'name' ], job[ 'user' ],
                                self._toFraction( job[ 'mapProgress' ] ),
                                self._toFraction( job[ 'reduceProgress' ] ),
                                state,
                                job[ 'startTime' ], job[ 'diagnostics' ],
                                app[ 'queue' ], app[ 'trackingUrl' ] )
         runningJobs.append( jobStatus )

      return runningJobs

   # Get all running task attempts in cluster.
   def getRunningTaskAttempts( self ):
      """Return a TaskAttemptStatus for every RUNNING attempt in the cluster.

      Only RUNNING attempts of RUNNING tasks of RUNNING jobs are returned.
      Counters are not fetched: the `counters` attribute is None.
      """
      t2( 'YarnClient::getRunningTaskAttempts' )
      runningAttempts = []

      for _, am, job in self._runningAppJobs():
         if job[ 'state' ] != 'RUNNING':
            continue
         for task in am.getJobTasks( job[ 'id' ] ):
            if task[ 'state' ] != 'RUNNING':
               continue
            for attempt in am.getTaskAttempts( job[ 'id' ], task[ 'id' ] ):
               if attempt[ 'state' ] != 'RUNNING':
                  continue
               runningAttempts.append( self._attemptStatus( attempt ) )

      return runningAttempts

   # Get task attempts for given Job Id.
   def getTaskAttempts( self, jobId ):
      """Return a TaskAttemptStatus for every attempt of job `jobId`.

      Returns [] when no client can be found for the job.  Counters are not
      fetched: the `counters` attribute is None.
      """
      t2( 'YarnClient::getTaskAttempts for jobId:', jobId )
      attemptStatuses = []

      client = self._clientForJob( jobId )
      if client is None:
         return []

      for task in client.getJobTasks( jobId ):
         for attempt in client.getTaskAttempts( jobId, task[ 'id' ] ):
            attemptStatuses.append( self._attemptStatus( attempt ) )

      return attemptStatuses

   # Get counters for given Task Attempt Ids.
   # All attemptIds must belong to the same Job.
   def getTaskAttemptCounters( self, attemptIds ):
      """Return one TaskCounters per attempt id, in iteration order.

      Args:
        - attemptIds: A single attempt id or any iterable of attempt ids, all
          belonging to the same job (the job is derived from the first id).
      Returns:
        A list of TaskCounters; a counter absent from the response is 0.
        [] when `attemptIds` is empty or no client can be found for the job.
      """
      t2( 'YarnClient::getTaskAttemptCounters for attemptIds:',  attemptIds )
      taskAttemptCounters = []

      if not attemptIds:
         return taskAttemptCounters

      if isinstance( attemptIds, str ) or not hasattr(
            attemptIds, '__iter__' ):
         attemptIds = [ attemptIds ]
      else:
         # Materialize, so that any iterable (set, generator, ...) can be
         # indexed below and iterated afterwards.
         attemptIds = list( attemptIds )
         if not attemptIds:
            return taskAttemptCounters

      # Job ids are task ids stripped of their last two '_'-separated fields.
      taskId = self._taskIdOf( attemptIds[ 0 ] )
      jobId = taskId[ : taskId.rfind( '_', 0, taskId.rfind( '_' ) ) ].replace(
            'task', 'job' )
      client = self._clientForJob( jobId )
      if client is None:
         return []

      wantedGroups = ( 'org.apache.hadoop.mapreduce.FileSystemCounter',
                       'org.apache.hadoop.mapreduce.TaskCounter' )
      for attemptId in attemptIds:
         taskId = self._taskIdOf( attemptId )

         hdfsBytesRead = 0
         hdfsBytesWritten = 0
         reduceShuffleBytes = 0
         attemptCounterGroups = client.getTaskAttemptCounters( jobId, taskId,
                                                               attemptId )
         for attemptCounterGroup in attemptCounterGroups:
            if attemptCounterGroup.get( 'counterGroupName' ) not in wantedGroups:
               continue

            for attemptCounter in attemptCounterGroup.get( 'counter', [] ):
               counterName = attemptCounter.get( 'name' )
               if counterName == 'HDFS_BYTES_READ':
                  hdfsBytesRead = attemptCounter.get( 'value', 0 )
               elif counterName == 'HDFS_BYTES_WRITTEN':
                  hdfsBytesWritten = attemptCounter.get( 'value', 0 )
               elif counterName == 'REDUCE_SHUFFLE_BYTES':
                  reduceShuffleBytes = attemptCounter.get( 'value', 0 )

         taskAttemptCounters.append( TaskCounters( hdfsBytesRead, hdfsBytesWritten,
                                                   reduceShuffleBytes ) )

      return taskAttemptCounters
