# Copyright (c) 2013 Arista Networks, Inc.  All rights reserved.
# Arista Networks, Inc. Confidential and Proprietary.

from datetime import datetime
import os
import re
import socket
import threading
import time
import traceback
from operator import attrgetter
import collections
from collections import defaultdict
import queue
import urllib.error
import urllib.request

import Agent
import HadoopTracerLogMsgs
import Logging
import Tac
import Tracing
import SharedMem
import Smash
import MlagMountHelper

from HadoopRpc import ConnectionClosedException
from HadoopRpc import HadoopException
from HadoopRpc import NullInstanceException
from HadoopRpc import InvalidRpcResponseException
from HadoopRpc import RemoteRpcException
from JobTrackerRpc import JobTrackerClient
from JobTrackerRpc import JobId
from JobTrackerRpc import TaskId
from YarnClient import YarnClient

# The next timeouts influence the heartBeatPeriod, so make sure that if they
# grow we adjust the heartbeatPeriod!
#   HTTP_FETCH_TIMEOUT: timeout used when fetching the status page from the
#                       TaskTrackers' HTTP interface.
#   RPC_TIMEOUT:        timeout waiting for an RPC response.
HTTP_FETCH_TIMEOUT = 60  # seconds
RPC_TIMEOUT = 60 # seconds

# Name of the default VRF, read from an instance of the 'Vrf::Constants' type
DEFAULT_VRF_NAME = Tac.newInstance( 'Vrf::Constants' ).defaultVrfName

# Tracing: a single handle for this agent plus one short alias per trace
# function ( trace0 .. trace8 ), named after the kind of message it carries.
traceHandle = Tracing.Handle( 'HadoopTracerAgent' )
error = traceHandle.trace0
warn = traceHandle.trace1
info = traceHandle.trace2
trace = traceHandle.trace3
debug = traceHandle.trace4
rpcTrace = traceHandle.trace5
httpTrace = traceHandle.trace6
timeTrace = traceHandle.trace7
sysdbTrace = traceHandle.trace8

# Shortcuts to the 'HadoopTracer::*' Tac types used to build the TAC model
ClusterStatus = Tac.Type( 'HadoopTracer::ClusterStatus' )
ClusterInfo = Tac.Type( 'HadoopTracer::ClusterInfo' )
Counters = Tac.Type( 'HadoopTracer::Counters' )
JobInfo = Tac.Type( 'HadoopTracer::JobInfo' )
Status = Tac.Type( 'HadoopTracer::ClusterInfo::Status' )
Priority = Tac.Type( 'HadoopTracer::JobInfo::Priority' )
RunState = Tac.Type( 'HadoopTracer::RunState' )
NodeState = Tac.Type( 'HadoopTracer::TaskTrackerInfo::NodeState' )
NodeStateDetail = Tac.Type( 'HadoopTracer::TaskTrackerInfo::NodeStateDetail' )
JobKey = Tac.Type( 'HadoopTracer::JobKey' )
TaskTrackerJobInfo = Tac.Type( 'HadoopTracer::TaskTrackerJobInfo' )
TaskReportKey = Tac.Type( 'HadoopTracer::TaskReportKey' )
TaskReport = Tac.Type( 'HadoopTracer::TaskReport' )
TaskReportStatus = Tac.Type( 'HadoopTracer::TaskReport::Status' )
LocalJobCounter = Tac.Type( 'HadoopTracer::LocalJobCounter' )
JobHistory = Tac.Type( 'HadoopTracer::JobHistory' )
CountersPerIntfId = Tac.Type( 'HadoopTracer::CountersPerIntfId' )
BurstCounter = Tac.Type( 'HadoopTracer::BurstCounter' )

###### Auxiliary generic function
def profileTime( maxTime, curSum, curCount, start ):
   """ Fold the time elapsed since start into running profiling statistics.

   Args:
      - maxTime: Longest duration recorded so far.
      - curSum: Accumulated time spent so far.
      - curCount: Number of runs accounted so far.
      - start: time.time() value taken when the current run started.
   Returns:
      A ( maxTime, curSum, curCount ) tuple that includes the current run.
   """
   elapsed = time.time() - start
   # Only a strictly longer run replaces the recorded maximum
   longest = elapsed if elapsed > maxTime else maxTime
   return ( longest, curSum + elapsed, curCount + 1 )


# Lightweight immutable record with the fields jobId, jobName, inBytes and
# outBytes.
BurstCountersInfo = collections.namedtuple( 'BurstCountersInfo', 
                                     'jobId jobName inBytes outBytes' )
def milliSecToSec( timeInMilliSec ):
   """ Convert a value expressed in milliseconds into ( fractional ) seconds. """
   milliSecPerSec = 1000.0
   return timeInMilliSec / milliSecPerSec

def tacJobKey2JobId( jobId ):
   """ Build a JobTracker.JobId out of the jtId and jobId fields of a TAC
       'HadoopTracer::JobKey'. """
   jobTrackerId = jobId.jtId
   jobNumber = jobId.jobId
   return JobId( jobTrackerId, jobNumber )

def taskReportKey2Tac( attemptId, taskId ):
   """ Build a constant Tac TaskReportKey from a JobTracker.TaskId and the
       attemptId.  The task number is converted to an int on the way. """
   taskNumber = int( taskId.taskid )
   reportKey = TaskReportKey( taskNumber, attemptId, taskId.ismap )
   return Tac.const( reportKey )

def taskIdFromTaskReportKey( jobId, taskReportKey ):
   """ Build a JobTracker TaskId from a TAC job key and a TaskReportKey.  This is
       the reverse direction of taskReportKey2Tac, minus the attemptId. """
   ownerJob = JobId( jobid=jobId.jobId, jtid=jobId.jtId )
   return TaskId( jobid=ownerJob,
                  taskid=taskReportKey.taskId,
                  ismap=taskReportKey.isMap )

def delNotInStatusCollection( collectionName, statusColl, itemColl ):
   ''' Remove the items of the status collection that are not in itemColl.

   Args:
      - collectionName: Label of the collection, only used for tracing.
      - statusColl: Collection to prune, modified in place.
      - itemColl: Collection holding the keys that must be kept.
   '''
   # Walk a snapshot of the keys: deleting from a collection while iterating
   # over it raises RuntimeError for a dict and may skip entries for other
   # containers.
   for itemName in list( statusColl ):
      if itemName not in itemColl:
         sysdbTrace( 'removing', collectionName, itemName )
         del statusColl[ itemName ]

def isTaskInDict( jobId, taskReportKey, dictTasks ):
   """ Tell whether taskReportKey is recorded under jobId in dictTasks.  A job
       that is missing from the dictionary simply yields False. """
   if jobId not in dictTasks:
      return False
   return taskReportKey in dictTasks[ jobId ]

def addBytes( jobId, intfId, counters, taskCounters ):
   """ Return a new Counters value holding, field by field, the sum of counters
       and taskCounters.  jobId and intfId are only used for tracing. """
   debug( 'Adding bytes for jobId:', jobId, 'intfId', intfId, 'counters',
          counters, 'taskCounters', taskCounters )
   return Counters(
         hdfsBytesRead=counters.hdfsBytesRead + taskCounters.hdfsBytesRead,
         hdfsBytesWritten=( counters.hdfsBytesWritten +
                            taskCounters.hdfsBytesWritten ),
         reduceShuffleBytes=( counters.reduceShuffleBytes +
                              taskCounters.reduceShuffleBytes ) )

class InconsistentHadoopDataException( HadoopException ):
   """ Raised when the data received from Hadoop is inconsistent. """
   def __init__( self, msgError ):
      super().__init__( "Inconsistent Hadoop info: %s"  % ( msgError ) )

class HttpParsingErrorException( HadoopException ):
   """ Raised when the HTTP response of a tasktracker cannot be parsed. """
   def __init__( self, url, msgError ):
      details = ( url, msgError )
      super().__init__(
            "Error with HTTP data from URL:%s. Error detail: %s"  % details )

class InvalidMappingException( HadoopException ):
   """ Raised when a value from an RPC response cannot be mapped to our TAC
       model ( see chkTacTypeEnum ). """

class ReceivedStopException( HadoopException ):
   """ Signals that data collection was abandoned in the middle of a polling
       cycle because the main activity loop asked the thread to stop.
   """
   def __init__( self, description ):
      # description names the step that had just finished when we stopped
      super().__init__( "Stopped after %s" % description )

class ParsingErrorException( HadoopException ):
   """ Raised when a field required by the HadoopTracer logic cannot be
       parsed. """
   def __init__( self, data, errorMsg ):
      details = ( data, errorMsg )
      super().__init__(
            "Error parsing data received %r. Error detail:%s"  % details )

def chkTacTypeEnum( tacType, value, tmpData ):
   """ Validate value against the attributes of the TAC type tacType.

   Args:
      - tacType: Tac type whose 'attributes' list the accepted values.
      - value: The value to validate.
      - tmpData: Receives the error message ( in clusterStatus.error ) when the
        value is rejected.
   Returns:
      value, unchanged, when it is accepted.
   Raises:
      InvalidMappingException when value is not one of tacType.attributes.
   """
   if value not in tacType.attributes:
      msg = f'Got {value!r} but expected a value within {tacType.attributes!r}'
      # Record the failure on the polling data before bailing out
      tmpData.clusterStatus.error = msg
      raise InvalidMappingException( msg )
   return value

class QueueEntry:
   """ Unit of data handed over by a HadoopTracer polling thread to the activity
       loop that updates sysdb.

   Attributes:
      - clusterName: Name of the cluster the data belongs to.
      - data: The payload produced by the polling thread.
   """
   def __init__( self, clusterName, data ):
      self.clusterName = clusterName
      self.data = data

class TimedExceptionHandler:
   """Context manager wrapping one step of the data gathering.

   On exit it traces how long the step took, raises ReceivedStopException when
   the polling thread was asked to stop and, when the step failed, records a
   human-readable error message on tmpData.  A failure of the step itself is
   never swallowed.
   """

   def __init__( self, description, agent, tmpData ):
      """Constructor.

      Args:
        - description: Human-readable string naming the action undertaken,
          used for tracing and error reporting.
        - agent: Object providing stopped(); for the MR1/MR2 polling threads
          its host and port are appended to the error message.
        - tmpData: Its clusterStatus.error field receives the error message
          when the wrapped step raises.
      """
      self.description = description
      self.tmpData = tmpData
      self.agent = agent
      self.start = None

   def __enter__( self ):
      # Remember when the step started so __exit__ can trace its duration
      self.start = time.time()

   def __exit__( self, excType, exception, stackTrace ):
      elapsed = time.time() - self.start
      timeTrace( "Time to", self.description, elapsed )
      # A stop request wins over whatever happened inside the block
      if self.agent.stopped():
         raise ReceivedStopException( self.description )
      if excType is None:
         return False

      # Checked in order: the first entry excType derives from is the one used
      knownFailures = (
         ( socket.error, "Socket error" ),
         ( NullInstanceException,
           "\nReceived a null RPC response caused most likely by"
           " a JobTracker restart."
           "\nError detail" ),
         ( RemoteRpcException, "RPC error" ),
         ( InvalidRpcResponseException, "Malformed RPC response from Hadoop" ),
         ( InvalidMappingException,
           "Mismatch value from RPC to map to Agent model" ),
         ( ConnectionClosedException, "Connection error" ),
         ( HttpParsingErrorException, "HTTP parsing error" ),
      )
      errorMsg = "Failed to %s. " % self.description
      for failureType, failureText in knownFailures:
         if issubclass( excType, failureType ):
            errorMsg += failureText
            break
      else:
         errorMsg += "Unexpected error of type %s" % excType.__name__

      # Name the remote end we were talking to, when we know which one it is
      endpoint = None
      if isinstance( self.agent, HadoopTracerMR1Thread ):
         endpoint = ( self.agent.host, self.agent.rpcPort )
      elif isinstance( self.agent, HadoopTracerMR2Thread ):
         endpoint = ( self.agent.resourceManagerHost,
                      self.agent.resourceManagerPort )
      if endpoint is not None:
         errorMsg += ": %s on %s:%d" % ( ( exception, ) + endpoint )

      error( errorMsg )
      self.tmpData.clusterStatus.error = errorMsg
      # Let the exception propagate to the caller
      return False

class BaseClass:
   """ Base class for the Aux objects, providing a readable repr. """
   def __repr__( self ):
      # Attributes are listed in name order; names starting with "_" are hidden
      shown = [ "%s=%r" % ( attr, value )
                for attr, value in sorted( self.__dict__.items() )
                if attr[ 0 ] != "_" ]
      return "{}({})".format( self.__class__.__name__, ", ".join( shown ) )

class AuxTaskTrackerInfo( BaseClass ):
   """ Side information kept per taskTracker while building its
       TaskTrackerInfo. """
   def __init__( self, isLocal, url, name ):
      super().__init__()
      # jobId -> set of task keys, for the running jobs extracted from the HTTP
      # pages of the tasktrackers
      self.httpTasks = {}
      self.isLocal = isLocal
      self.url = url
      self.name = name
      # jobId -> set of completed tasks for jobs running on the taskTracker
      self.completedTasks = {}
      # jobId -> set of not completed tasks ( can be killed, failed, etc ) for
      # jobs running on the taskTracker
      self.notCompletedTasks = {}

   def newRunningJob( self, jobId ):
      """ Start ( or reset ) the set of HTTP tasks tracked for jobId. """
      debug( 'new http running Job', jobId, 'for', self.name )
      self.httpTasks[ jobId ] = set()

   def newHttpTaskReport( self, jobId, key ):
      """ Record the task key as seen via HTTP for jobId.  The job must have
          been registered with newRunningJob first. """
      debug( 'new http task for running Job', jobId, 'taskTracker', self.name,
            'key', key )
      jobTasks = self.httpTasks[ jobId ]
      jobTasks.add( Tac.const( key ) )

   def newCompletedTask( self, jobId, taskId ):
      """ Record taskId as completed for jobId. """
      self.completedTasks.setdefault( jobId, set() ).add( taskId )

   def newNotCompletedTask( self, jobId, taskId ):
      """ Record taskId as not completed for jobId. """
      self.notCompletedTasks.setdefault( jobId, set() ).add( taskId )

class TmpData( BaseClass ):
   """ Store temporary data received/processed from Hadoop RPC to build the
       TAC model.  One instance is created per polling cycle of a cluster. """
   def __init__( self, lastUpdate, clusterName ):
      """ Create an empty container for one polling cycle.

      Args:
         - lastUpdate: Timestamp of the polling cycle this data belongs to.
         - clusterName: Name of the cluster being polled.
      """
      super().__init__()
      self.clusterName = clusterName
      self.clusterStatus = ClusterStatus( clusterName )
      self.lastUpdate = lastUpdate
      # dict of auxiliary info to build TaskTrackerInfo.
      self.auxTaskTrackerInfo = {}
      # dict of JobHistory.  We are not using the clusterStatus.jobHistory for fast
      # indexing while adding JobHistoryCounters
      self.jobHistory = {}
      # Flag to indicate if this is the first data retrieved from the jobTracker
      # since we started the thread activity
      self.firstPoll = True

   def clusterInfoIs( self, clusterInfo ):
      """ Store clusterInfo on the cluster status being built. """
      self.clusterStatus.clusterInfo = clusterInfo

   def runningJobIs( self, jobList ):
      """ Create one running job entry per job of jobList, keyed by its jobKey,
          and attach the job to it as jobInfo. """
      for job in jobList:
         self.clusterStatus.newRunningJob( job.jobKey )
         self.clusterStatus.runningJob[ job.jobKey ].jobInfo = job

   def localTaskTrackerIs( self, name, shortName, ipAddress, url, isLocal, intf,
                           stateDetail ):
      """ Create the TaskTrackerInfo entry of taskTracker name together with its
          AuxTaskTrackerInfo.  ipAddress and intf are only written when they are
          set ( truthy ). """
      self.clusterStatus.newLocalTaskTrackerInfo( name )
      self.auxTaskTrackerInfo[ name ] = AuxTaskTrackerInfo( isLocal, url, name )
      taskTrackerInfo = self.clusterStatus.localTaskTrackerInfo[ name ]
      taskTrackerInfo.shortName = shortName
      if ipAddress:
         taskTrackerInfo.ipAddress = ipAddress
      if intf:
         taskTrackerInfo.intfId = intf
      taskTrackerInfo.stateDetail = stateDetail
      info( '_localTaskTrackers', taskTrackerInfo, taskTrackerInfo.ipAddress,
            taskTrackerInfo.intfId, isLocal, stateDetail )

   def updateTaskTrackerReport( self, name, jobId, taskKey, auxTaskTracker, task ):
      """ Add a TaskReport built from task to the running job jobId of the
          taskTracker name, creating the running job entry if needed.

      Args:
         - name: Name of the taskTracker running the task.
         - jobId: Key of the job the task belongs to.
         - taskKey: Key of the new TaskReport.
         - auxTaskTracker: Not used by this method.
         - task: Task data; start and end are converted from milliseconds to
           seconds.
      Raises:
         InvalidMappingException if task.status is not a valid
         TaskReportStatus value.
      """
      trace( 'updateTaskTrackerReport entry', name )
      debug( 'jobid', jobId, 'key:', taskKey, 'in tt:', name, 'hdfsBytesRead',
             task.counters.hdfsBytesRead, 'hdfsBytesWritten',
             task.counters.hdfsBytesWritten, 'reduceShuffleBytes',
             task.counters.reduceShuffleBytes )
      taskTracker = self.clusterStatus.localTaskTrackerInfo[ name ]
      if jobId not in taskTracker.runningJob:
         taskTracker.newRunningJob( jobId )

      runningTask = TaskReport( taskKey )
      runningTask.status = chkTacTypeEnum( TaskReportStatus, task.status,
                                           self )
      runningTask.state = task.state
      runningTask.startTime = milliSecToSec( task.start )
      runningTask.endTime = milliSecToSec( task.end )
      runningTask.progress = task.progress

      runningTask.counter = Counters( hdfsBytesRead=task.counters.hdfsBytesRead,
                              hdfsBytesWritten=task.counters.hdfsBytesWritten,
                             reduceShuffleBytes=task.counters.reduceShuffleBytes )
      taskTracker.runningJob[ jobId ].runningTask.addMember( runningTask )

   def newTaskTrackerInfo( self, status, name ):
      """ Create the entry of taskTracker name in status and fill it with the
          data gathered during this polling cycle. """
      status.newLocalTaskTrackerInfo( name )
      sysdbTrace( 'new taskTracker', name )
      self.updateTaskTrackerTacFields( name, status.localTaskTrackerInfo[ name ],
                                       init=True )

   def updateTaskTrackerTacFields( self, name, infoEntry, init=False ):
      """ Copy the data gathered for taskTracker name during this polling cycle
          into infoEntry, logging a message when its state changes.

      Args:
         - name: Name of the taskTracker.
         - infoEntry: TaskTrackerInfo entry to update.
         - init: True when infoEntry was just created; the 'active' message is
           then logged even without a state change.
      """
      trace( 'updateTaskTrackerTacFields for', name )
      taskTracker = self.clusterStatus.localTaskTrackerInfo[ name ]
      auxTaskTracker = self.auxTaskTrackerInfo[ name ]
      # A taskTracker is only active when it is local and attached to a known
      # interface
      if not auxTaskTracker.isLocal or not taskTracker.intfId:
         if infoEntry.state != NodeState.inactive:
            # Log this message only if the state is changing from active to inactive.
            # Notice that the statusDetail might be changing but we will not log
            # this message until the taskTracker becomes active and then inactive
            # again
            def _getStatusDetailMsg( stateDetail ):
               if stateDetail == NodeStateDetail.resolved:
                  return "Node resolved"
               elif stateDetail == NodeStateDetail.unresolvedDnsName:
                  return "Unresolved name to IP address."
               elif stateDetail == NodeStateDetail.noArpEntry:
                  return "Could not find ARP entry."
               elif stateDetail == NodeStateDetail.unresolvedArpEntry:
                  return "Unresolved ARP entry."
               elif stateDetail == NodeStateDetail.peerArpEntry:
                  return "ARP entry belongs to MLAG peer."
               else:
                  return "Unknown status detail"

            stateDetailMsg = _getStatusDetailMsg( taskTracker.stateDetail )
            Logging.log( HadoopTracerLogMsgs.MAPREDUCEMONITOR_TASKTRACKER_INACTIVE,
                         name, stateDetailMsg )
         infoEntry.state = NodeState.inactive
      else:
         if init or infoEntry.state != NodeState.active:
            # Log a message if the state is changing or if this is the first time
            # we are detecting this taskTracker
            Logging.log( HadoopTracerLogMsgs.MAPREDUCEMONITOR_TASKTRACKER_ACTIVE,
                         name )
         infoEntry.state = NodeState.active

      infoEntry.stateDetail = taskTracker.stateDetail
      infoEntry.shortName = taskTracker.shortName
      infoEntry.ipAddress = taskTracker.ipAddress
      infoEntry.intfId = taskTracker.intfId

      # The error is only overwritten when this cycle produced one
      if taskTracker.error:
         infoEntry.error = taskTracker.error

      # Remove old job entries
      delNotInStatusCollection( 'TaskTrackerInfo:runningJobs',
                                infoEntry.runningJob,
                                taskTracker.runningJob )

      for jobId in taskTracker.runningJob:
         if jobId not in infoEntry.runningJob:
            infoEntry.newRunningJob( jobId )
         else:
            # Cleanup all the tasks from the previous interval
            infoEntry.runningJob[ jobId ].runningTask.clear()

         sysdbRunningTask = infoEntry.runningJob[ jobId ].runningTask
         for taskReport in taskTracker.runningJob[ jobId ].runningTask.values():
            sysdbRunningTask.addMember( taskReport )

      sysdbTrace( 'updated taskTracker sysdb', infoEntry.state,
                  infoEntry.shortName, infoEntry.ipAddress, infoEntry.intfId )

      # Dumping every job and task is only worth it when that tracing is enabled
      if Tracing.enabled( 8 ):
         for job in infoEntry.runningJob:
            sysdbTrace( 'taskTracker sysdb running job:', job )
            for task in infoEntry.runningJob[ job ].runningTask:
               sysdbTrace( 'taskTracker sysdb tasks:',
                    infoEntry.runningJob[ job ].runningTask[ task ] )

   def setIsLocalJob( self, jobId ):
      """ Flag the running job jobId as local, when it is known for this polling
          cycle. """
      job = self.clusterStatus.runningJob.get( jobId )

      if job is not None:
         # The case we will find job None is a race between our HTTP call and the
         # call to retrieve the running jobs.  We will collect the new job we
         # saw via HTTP on our next polling interval.
         job.isLocal = True

   def setNxtEventId( self, jobId, lastEventId ):
      """ Set the nextEventId of the running job jobId to the event following
          lastEventId.  Nothing is stored when the job is no longer running. """
      if jobId in self.clusterStatus.runningJob:
         self.clusterStatus.runningJob[ jobId ].nextEventId = lastEventId + 1
         debug( 'jobId', jobId, 'nxtEventId',
                self.clusterStatus.runningJob[ jobId ].nextEventId )
      else:
         debug( 'jobId', jobId, 'completed but getting last completion events' )


   def addJobHistoryCounter( self, jobId, name, intfId, taskCounters ):
      """ Add taskCounters to the job history counters kept per jobId and
          intfId, creating the entries on first use, and record the taskTracker
          name as a contributor. """
      if jobId not in self.jobHistory:
         self.jobHistory[ jobId ] = {}

      if intfId not in self.jobHistory[ jobId ]:
         self.jobHistory[ jobId ][ intfId ] = CountersPerIntfId( intfId=intfId )

      jobHistory = self.jobHistory[ jobId ][ intfId ]
      jobHistory.counters = addBytes( jobId, intfId, jobHistory.counters,
                                      taskCounters )
      jobHistory.taskTracker[ name ] = True
      debug( 'Adding counters to history of jobId', jobId, 'intfId', intfId,
             self.jobHistory[ jobId ][ intfId ] )

   def addLocalJobCounter( self, jobId, name, intfId, taskCounters,
                           isCompletedBytes ):
      """ Add the current Bytes to the LocalJobCounter. We keep on accumulating
          the bytes until we have to write it into sysdb.

      Args:
         - jobId: Key of a job present in clusterStatus.runningJob.
         - name: Name of the taskTracker the counters come from.
         - intfId: Interface the counters are accounted against.
         - taskCounters: Counters to add.
         - isCompletedBytes: True to add to accumCompletedBytes, False to add
           to accumBytes.
      """
      runningJob = self.clusterStatus.runningJob[ jobId ]
      if intfId not in runningJob.localJobCounter:
         runningJob.newLocalJobCounter( intfId )

      localJobCounter = runningJob.localJobCounter[ intfId ]
      localJobCounter.taskTracker[ name ] = True
      if isCompletedBytes:
         localJobCounter.accumCompletedBytes = addBytes( jobId, intfId,
                                    localJobCounter.accumCompletedBytes,
                                    taskCounters )
      else:
         localJobCounter.accumBytes = addBytes( jobId, intfId,
                                    localJobCounter.accumBytes,
                                    taskCounters )

      debug( 'counters for', jobId, 'intfId', intfId, 'accumCompletedBytes',
             localJobCounter.accumCompletedBytes, 'accumBytes',
             localJobCounter.accumBytes )


# Main update thread class. There is one thread per cluster.  Each thread polls the
# JobTracker for updates.
class HadoopTracerBaseThread( threading.Thread ):
   """ Base class of the polling threads.  There is one thread per cluster; each
       polling cycle gathers the cluster data into a TmpData and hands it to the
       agent through its resultQueue.

       Subclasses provide the data retrieval itself ( _getClusterStatus,
       _getRunningJobs, _getTasksForJob, _getRunningTasks,
       _updateCompletedTasks, _processTasks ) and isClusterConfigComplete.
   """
   def __init__( self, clusterName, agent ):
      """ Set up the thread for clusterName.  The process is aborted when the
          cluster configuration is incomplete.

      Args:
         - clusterName: Name of the cluster to poll; must be a key of
           agent.config.clusterConfig.
         - agent: Owning agent, used to reach its config, status and queue.
      """
      trace( 'HadoopTracerThread.__init__ entry for', clusterName )
      clusterConfig = agent.config.clusterConfig[ clusterName ]
      if not self.isClusterConfigComplete( agent.config, clusterConfig ):
         print( '{}: {}'.format( datetime.now().strftime( '%Y-%m-%d %H:%M:%S' ),
                 'Required parameters not provided to start a HadoopTracerThread' ) )
         os.abort()

      super().__init__()
      self.clusterName = clusterName  # Good for debugging
      self.agent = agent  # used to access sysdb values from arp, etc

      # We copy the clusterConfig since the config might be removed (no on the CLI)
      self.interval = clusterConfig.interval

      self._stop_thread = threading.Event()
      # Jobs that had incomplete local tasks on the previous / current cycle
      self.prevJobsWithIncompleteTasks = set()
      self.currJobsWithIncompleteTasks = set()
      # Flag to indicate if this is the first data retrieved from the jobTracker
      # since we started the thread activity
      self.firstPoll = True
      trace( 'HadoopTracerThread.__init__ exit for', clusterName )

   def _periodicCleanup( self, tmpData ):
      """ Hook called at the end of every polling cycle; no-op by default. """
      pass

   def _finalCleanup( self ):
      """ Hook called once when the thread leaves its loop; no-op by default. """
      pass

   @classmethod
   def isClusterConfigComplete( cls, agentCfg, cfg ):
      """ Check if the clusterConfig contains all required parameters """
      raise NotImplementedError()

   def stop( self ):
      """ Ask the thread to stop; this also wakes it up from its wait between
          two polling cycles. """
      trace( 'HadoopTracerThread.stop for', self.clusterName )
      self._stop_thread.set()
      trace( 'HadoopTracerThread.stop exit for', self.clusterName )

   def stopped( self ):
      """ Return True once stop() has been called. """
      # is_set() replaces the deprecated camelCase alias isSet()
      return self._stop_thread.is_set()

   def run( self ):
      """ Thread body: poll the cluster every self.interval seconds until
          stopped, queueing the result of each completed cycle on the agent's
          resultQueue. """
      trace( 'HadoopTracerThread.run entry for:', self.clusterName )

      while not self.stopped():
         startTime = time.time()
         tmpData = TmpData( startTime, self.clusterName )
         try:
            trace( 'Starting cycle to getClusterStatus' )
            trackers = self._getClusterStatus( tmpData )
            info( '_getClusterStatus:', tmpData.clusterStatus.clusterInfo )
            self._getRunningJobs( tmpData )
            info( '_getRunningJobs:', len( tmpData.clusterStatus.runningJob ) )
            # Even if there are no runningJob we fill the localTaskTrackers for
            # info on what is connected to the switch.
            self._fillLocalTaskTrackers( trackers, tmpData )
            if tmpData.clusterStatus.runningJob or self.prevJobsWithIncompleteTasks:
               # If there are running Jobs or if we found tasks running locally
               # previously for a job we try to retrieve the task info
               self._getTasks( tmpData )
         except ReceivedStopException as e:
            trace( 'Thread interrupted by stop event:', e )
         except Exception as e:  # pylint: disable-msg=W0703
            # Top-level boundary of the thread: report the failure through the
            # cluster status instead of letting the thread die
            error( 'Data collection cycle failed with exception', e )
            traceback.print_exc()
            if not tmpData.clusterStatus.error:
               tmpData.clusterStatus.error = 'Internal error: %s' % e
         if not self.stopped():
            # We just add info on mapreduce if we weren't stopped.  We might
            # have incomplete data if we were stopped while collecting
            # data.
            timeTrace( 'Thread:', threading.current_thread().name,
                       'adding item' )
            tmpData.firstPoll = self.firstPoll
            entry = QueueEntry( self.clusterName, tmpData )
            self.agent.resultQueue.writeData( entry )

         self._periodicCleanup( tmpData )
         if traceHandle.enabled( 7 ):
            timeTrace( 'Time to complete one iteration of polling for cluster:',
                       self.clusterName, ':', time.time() - startTime )
         # Release our references to tmpData and entry, they belong to the
         # queue from here on
         self.firstPoll = False
         entry = None
         tmpData = None
         # Sleep until the next cycle, or until stop() wakes us up
         self._stop_thread.wait( self.interval )

      self._finalCleanup()
      trace( 'HadoopTracerThread.run exit for', self.clusterName )

   def _getIpAddress( self, name, tmpData ):
      """ Resolve name to an IPv4 address.

      Returns:
         The first address returned by the resolver, or None when the name
         cannot be resolved.  tmpData is not used.
      """
      ipAddr = None
      try:
         # GetAddrinfo timeout is not something we can control (os dependent )
         # TODO use threads and retrieve a bunch of addresses in parallel
         addrInfo = socket.getaddrinfo( name, None, socket.AF_INET )
         if addrInfo and len( addrInfo[ 0 ] ) > 4:
            ipAddr = addrInfo[0][4][0]
         else:
            error( "addrInfo doesn't have enough info", addrInfo )
      except OSError as e:
         error( 'cannot resolve', name, e )
         # Put this info on the agent log
         print( '{}: cannot resolve {}:{}'.format(
                datetime.now().strftime( '%Y-%m-%d %H:%M:%S' ), name, e ) )
      return ipAddr

   def _findArpEntry( self, ipAddress, name, allArpIntfStatuses ):
      """ Return NodeStateDetail plus the interface the ipAddress is attached to.
          The interface is None if the ipAddress cannot be resolved.

      Args:
         - ipAddress: Address to look for in the ARP entries.
         - name: Host name, only used for tracing.
         - allArpIntfStatuses: Object whose arpIntfStatus collection holds the
           per interface ARP entries.
      """
      trace( '_findArpEntry entry', name, ipAddress )
      retValue = ( NodeStateDetail.noArpEntry, None )
      for arpIntfStatus in allArpIntfStatuses.arpIntfStatus.values():
         intfName = arpIntfStatus.name
         entry = arpIntfStatus.arpEntry.get( ipAddress )
         if entry:
            if entry.ethAddr == '00:00:00:00:00:00':
               debug( 'hostname', name, 'ipaddress', ipAddress,
                      'intfName', intfName, 'has incomplete ethAddr',
                      entry.ethAddr )
               retValue = ( NodeStateDetail.unresolvedArpEntry, None )
            elif intfName.startswith( 'Vlan' ):
               # The ARP entry sits on a Vlan interface: look the host up in
               # the bridging table to find the interface it is attached to
               vlanId = int( intfName[ 4: ] )
               key = Tac.Value( 'Bridging::HostKey', vlanId, entry.ethAddr )
               host = self.agent.smashBridgingStatus.smashFdbStatus.get( key )
               if host:
                  debug( 'hostname', name, 'ipaddress', ipAddress,
                         'vlan', vlanId, 'intfName', host.intf,
                         'has ethAddr', entry.ethAddr )
                  retValue = ( NodeStateDetail.resolved, host.intf )
               else:
                  debug( 'Could not resolve hostname', name, 'ipaddress',
                         ipAddress, 'vlan', vlanId, 'has ethAddr', entry.ethAddr )
                  # One scenario that can lead here is if the link went down
                  # since we had a sysdb entry for the taskTracker we will
                  # mark it inactive
                  retValue = ( NodeStateDetail.unresolvedArpEntry, None )
            else:
               retValue = ( NodeStateDetail.resolved, intfName )
      hasMlag = self.agent.mlagStatus.mlagState in ( 'primary', 'secondary' )
      debug( 'hostname', name, 'ipaddress', ipAddress, 'arp result:', retValue,
             'hasMlag', hasMlag, 'mlagPeerLink',
             self.agent.mlagStatus.peerLinkIntf.intfId if hasMlag else 'N/A' )
      # If this is an mlag topology we might have resolved the entry because we
      # learned the host from our peer.  We don't want to consider the tasktracker
      # local in this case.
      if hasMlag and retValue[ 0 ] == NodeStateDetail.resolved and ( retValue[ 1 ]
            == self.agent.mlagStatus.peerLinkIntf.intfId ):
         debug( 'hostname', name, 'discovered via our peer, discarding entry' )
         retValue = ( NodeStateDetail.peerArpEntry, None )

      return retValue

   def _fillLocalTaskTrackers( self, trackers, tmpData ):
      """ Get the list of task Trackers directly connected to the switch interfaces.

      Every tracker is recorded on tmpData, local or not, together with the
      detail of how far its resolution ( name -> IP address -> interface ) went.
      """
      def _getFullName( tracker ):
         # tracker is a format like:
         # tracker_r12s15-10g.cs1.aristanetworks.com.:.....
         name = tracker.split( ":" )[ 0 ]
         if "_" in name:
            # Drop the prefix up to the first "_" only, so that a host name
            # which itself contains underscores is kept whole
            name = name.split( "_", 1 )[ 1 ]
         return name

      for tracker in trackers:
         isLocal = False
         url = None
         intf = None
         stateDetail = NodeStateDetail.unresolvedDnsName
         name = _getFullName( tracker )
         shortName = name.split( "." )[ 0 ]
         ipAddress = self._getIpAddress( name, tmpData )
         if ipAddress:
            ( stateDetail, intf ) = self._findArpEntry( ipAddress, name,
                                                        self.agent.arpStatus )
            isLocal = stateDetail == NodeStateDetail.resolved

            if isLocal:
               url = self._getTaskTrackerUrl( ipAddress )
         tmpData.localTaskTrackerIs( name, shortName, ipAddress, url, isLocal, intf,
                                     stateDetail )

   def _getClusterStatus( self, tmpData ):
      """ Retrieve the cluster status into tmpData and return the trackers of
          the cluster.  Must be provided by subclasses. """
      raise NotImplementedError()

   def _getRunningJobs( self, tmpData ):
      """ Retrieve the running jobs into tmpData.  Must be provided by
          subclasses. """
      raise NotImplementedError()

   def _getTasksForJob( self, jobId, localTaskTrackers, tmpData, mapTasksPerJob,
                        reduceTasksPerJob, completedTasksPerJob ):
      """ Fill the three per job dictionaries with the task data of jobId.  Must
          be provided by subclasses. """
      raise NotImplementedError()

   def _getRunningTasks( self, localTaskTrackers, tmpData ):
      """ Return the set of jobs with tasks running on the local taskTrackers.
          Must be provided by subclasses. """
      raise NotImplementedError()

   def _updateCompletedTasks( self, jobId, completedTasksPerJob, localTaskTrackers,
                              tmpData ):
      """ Dispatch the completed tasks of jobId to the per taskTracker auxiliary
          info.  Must be provided by subclasses. """
      raise NotImplementedError()

   def _processTasks( self, jobId, dictTasksPerJob, tmpData, localTaskTrackers, 
                      mapDictTasks ):
      """ Process the tasks of jobId found in dictTasksPerJob; mapDictTasks tells
          whether these are map ( True ) or reduce ( False ) tasks.  Must be
          provided by subclasses. """
      raise NotImplementedError()

   def _addCompletedTask( self, jobId, taskReportKey, name, intf, counters,
                          tmpData ):
      """ Add the completed task bytes to the current completed job bytes or to the
          job history if the job is done. """
      if jobId in tmpData.clusterStatus.runningJob:
         # If the task is completed (even if it was listed on
         # the HTTP) we only add its counters to the runningJob
         debug( 'Adding completed task counters to running Job', taskReportKey,
                jobId, name )
         tmpData.addLocalJobCounter( jobId, name, intf, counters, True )
      else:
         tmpData.addJobHistoryCounter( jobId, name, intf, counters ) 

   def _addHttpTask( self, jobId, name, taskReportKey, auxTaskTracker, 
                     intf, task, tmpData ):
      """ Add the task bytes to the current bytes job counters and to the 
          taskTracker list of TaskReports.

      Raises:
         InconsistentHadoopDataException when the job is listed via HTTP but is
         not among the running jobs reported by the JobTracker.
      """
      # If the task is on the list of HTTP tasks and is not
      # completed we add to the TaskTrackerReport
      if jobId not in tmpData.clusterStatus.runningJob:
         # HTTP should only list running jobs, hadoop is going off script.
         # jobId is a job key rather than a number, so it is formatted with %s:
         # %d would raise TypeError and hide the real error.
         raise InconsistentHadoopDataException(
               'Job in HTTP but JobTracker is not reporting it: %s' % ( jobId, ) )
      tmpData.updateTaskTrackerReport( name, jobId, taskReportKey,
                                       auxTaskTracker, task )
      debug( 'Adding http task counters to running Job', taskReportKey,
               jobId, name, task )
      tmpData.addLocalJobCounter( jobId, name, intf, task.counters, False )
      self.currJobsWithIncompleteTasks.add( jobId )

   def _getTasks( self, tmpData ):
      """ For each local taskTracker retrieve the list of running jobs/tasks via
          HTTP.
          For each job on the HTTP list or prevJobsWithIncompleteTasks, retrieve
          task counters.
      """
      # dict of map, reduce and completed tasks returned for a given job
      mapTasksPerJob = {}
      reduceTasksPerJob = {}
      completedTasksPerJob = {}
      taskTrackers = tmpData.clusterStatus.localTaskTrackerInfo
      if not taskTrackers:
         info( 'No taskTrackers for cluster', self.clusterName )
         return

      # separate the local taskTrackers to accelerate the loop while processing the
      # tasks.  We add to the localTaskTrackers taskTrackers that used to be local
      # but we lost connection.  That's so we don't miss the completion events for
      # the previous tasks we accounted on the last interval. 
      localTaskTrackers = {}
      sysdbClusterStatus = self.agent.status.clusterStatus[ self.clusterName ]
      sysdbTaskTracker = sysdbClusterStatus.localTaskTrackerInfo
      for name in tmpData.auxTaskTrackerInfo:
         if tmpData.auxTaskTrackerInfo[ name ].isLocal:
            localTaskTrackers[ name ] = (
                  tmpData.clusterStatus.localTaskTrackerInfo[ name ].intfId )
         elif name in sysdbTaskTracker and sysdbTaskTracker[ name ].intfId:
            localTaskTrackers[ name ] = sysdbTaskTracker[ name ].intfId

      if not localTaskTrackers:
         info( 'No localTaskTrackers for cluster', self.clusterName )
         return

      localRunningJobs = set()
      if tmpData.clusterStatus.runningJob:
         localRunningJobs = self._getRunningTasks( localTaskTrackers, tmpData )

      localRunningJobs.update( self.prevJobsWithIncompleteTasks )

      # For each local running job, get task reports, process that job and
      # clear the mapTasksPerJob, reduceTasksPerJob and completedTasksPerJob
      # dictionaries. This way, Task Reports for only one job will be saved in
      # memory (the 3 dictionaries mentioned above) at a time.
      for jobId in localRunningJobs:
         self._getTasksForJob( jobId, localTaskTrackers, tmpData, mapTasksPerJob,
                               reduceTasksPerJob, completedTasksPerJob )

         # process the completedTasksPerJob adding it to the per taskTracker
         # auxTaskTracker completedTask and notCompletedTask dict
         self._updateCompletedTasks( jobId, completedTasksPerJob, localTaskTrackers,
                                     tmpData )

         with TimedExceptionHandler( "process the reduce tasks", self, tmpData ):
            self._processTasks( jobId, reduceTasksPerJob, tmpData, localTaskTrackers,
                                False )

         with TimedExceptionHandler( "process the map tasks", self, tmpData ):
            self._processTasks( jobId, mapTasksPerJob, tmpData, localTaskTrackers,
                                True )

         mapTasksPerJob.clear()
         reduceTasksPerJob.clear()
         completedTasksPerJob.clear()

      debug( 'self.prevJobsWithIncompleteTasks:', self.prevJobsWithIncompleteTasks )
      debug( 'self.currJobsWithIncompleteTasks:', self.currJobsWithIncompleteTasks )
      # The jobs found incomplete during this cycle are revisited on the next one
      self.prevJobsWithIncompleteTasks = self.currJobsWithIncompleteTasks
      self.currJobsWithIncompleteTasks = set()

   def _getTaskTrackerUrl( self, ipAddress ):
      """ Return the URL used to query the taskTracker at ipAddress; the base
          class has none and returns an empty string. """
      return ""

class HadoopTracerMR1Thread( HadoopTracerBaseThread ):
   def __init__( self, clusterName, agent ) :
      """ Create the tracer thread for the MR1 cluster clusterName.

          Reads the JobTracker host, RPC port, HTTP port and user from
          agent.config.clusterConfig[ clusterName ] and builds the
          JobTrackerClient (with an RPC_TIMEOUT timeout) kept in self.jt. """
      trace( 'HadoopTracerMR1Thread.__init__ entry for', clusterName )
      super().__init__( clusterName, agent )

      # Keep private copies of the settings: the config entry may be removed
      # (no on the CLI) while this thread is still around.
      cfg = agent.config.clusterConfig[ clusterName ]
      self.host, self.rpcPort = cfg.host, cfg.rpcPort
      self.httpPort, self.user = cfg.httpPort, cfg.user

      self.jt = JobTrackerClient( host=self.host,
                                  port=self.rpcPort,
                                  user=self.user,
                                  timeout=RPC_TIMEOUT )
      trace( 'HadoopTracerMR1Thread.__init__ exit for', clusterName )

   def _finalCleanup( self ):
      if self.jt:
         # Close connection to JobTracker
         self.jt.close()
         self.jt = None

   @classmethod
   def isClusterConfigComplete( cls, agentCfg, cfg ):
      """ Check if the clusterConfig contains all required parameters """
      return ( agentCfg.enabled and cfg.enabled and cfg.host and
               cfg.rpcPort and cfg.httpPort and cfg.user and cfg.interval )

   def _getClusterStatus( self, tmpData ):
      """ Ask the JobTracker for the detailed cluster status, publish it in
          tmpData as a ClusterInfo and return the trackers it reported. """
      with TimedExceptionHandler( "get the cluster status", self, tmpData ):
         jtStatus = self.jt.getClusterStatus( detailed=True )
         rpcTrace( 'clusterStatus:', jtStatus )

      clusterInfo = ClusterInfo(
            status=chkTacTypeEnum( Status, jtStatus.status, tmpData ),
            activeTrackers=len( jtStatus.trackers ),
            blackListedTrackers=len( jtStatus.blacklisted ),
            decommissionedNodes=jtStatus.excludedNodes,
            trackerExpireInterval=jtStatus.trackerExpiryInterval,
            runningMapTasks=jtStatus.mapTasksRunning,
            runningReduceTasks=jtStatus.reduceTasksRunning,
            totalMapSlots=jtStatus.mapSlots,
            totalReduceSlots=jtStatus.reduceSlots,
            jobTrackerHeapSize=jtStatus.jtHeapSize,
            jobTrackerMaxHeapSize=jtStatus.jtMaxHeapSize )
      tmpData.clusterInfoIs( clusterInfo )

      return jtStatus.trackers

   def _getRunningJobs( self, tmpData ):
      """ Publish in tmpData one JobInfo for every job the JobTracker reports
          as still to complete. The job name, queue and url come from the job
          profile, which is fetched with one extra RPC per job. """
      with TimedExceptionHandler( "get the running jobs", self, tmpData ):
         jobsToComplete = self.jt.jobsToComplete()
         rpcTrace( 'jobsToComplete:', jobsToComplete )

      jobInfos = []
      for jobStatus in jobsToComplete:
         with TimedExceptionHandler( "get the job profile %s" % jobStatus.jobid,
                                     self, tmpData ):
            jobProfile = self.jt.getJobProfile( jobStatus.jobid )
            rpcTrace( 'getJobProfile:', jobProfile )
         info( 'Jobs to complete:', jobStatus.jobid, ' name:', jobProfile.name )

         jobKey = JobKey( jobStatus.jobid.jobid, jobStatus.jobid.jtid )
         jobInfos.append( JobInfo(
               jobKey,
               name=jobProfile.name,
               user=jobStatus.user,
               startTime=milliSecToSec( jobStatus.startTime ),
               mapProgress=jobStatus.mapProgress,
               reduceProgress=jobStatus.reduceProgress,
               priority=chkTacTypeEnum( Priority, jobStatus.priority, tmpData ),
               runState=chkTacTypeEnum( RunState, jobStatus.runState, tmpData ),
               cleanupProgress=jobStatus.cleanupProgress,
               setupProgress=jobStatus.setupProgress,
               failureInfo=jobStatus.failureInfo,
               schedulingInfo=jobStatus.schedulingInfo,
               queueName=jobProfile.queueName,
               url=jobProfile.url ) )
      tmpData.runningJobIs( jobInfos )

   def _getRunningTasks( self, localTaskTrackers, tmpData ):
      """ Fetch and parse the status page of every local TaskTracker and return
          the set of jobs that have tasks running on them.

          localTaskTrackers is iterated for TaskTracker names; trackers whose
          tmpData.auxTaskTrackerInfo entry is not flagged isLocal are skipped.
          The page is read from auxTaskTracker.url and handed to
          _parseHttpPage(), which fills auxTaskTracker.httpTasks.

          A tracker whose page cannot be fetched (URLError or a socket timeout)
          gets the failure stored in its own error attribute, the cluster-wide
          error is cleared and the remaining trackers are still processed. An
          empty page raises HttpParsingErrorException inside the fetch handler.

          Returns a set with the httpTasks keys of every tracker that was
          fetched and parsed. """
      localRunningJobs = set()
      for name in localTaskTrackers:
         auxTaskTracker = tmpData.auxTaskTrackerInfo[ name ]
         if not auxTaskTracker.isLocal:
            # Just http local taskTrackers.
            continue
         taskTracker = tmpData.clusterStatus.localTaskTrackerInfo[ name ]
         try:
            with TimedExceptionHandler( 'fetch for {}:{}'.format( name,
                                        auxTaskTracker.url ),
                                        self, tmpData ):
               # Use the response as a context manager so the connection is
               # closed even when read() fails.
               with urllib.request.urlopen( auxTaskTracker.url,
                                    timeout=HTTP_FETCH_TIMEOUT ) as response:
                  rawpage = response.read()
               if not rawpage:
                  raise HttpParsingErrorException( auxTaskTracker.url,
                        "Couldn't get any data back" )
               page = rawpage.decode()
         except ( urllib.error.URLError, socket.timeout ):
            # We continue extracting data from other trackers. The
            # TimedExceptionHandler populates tmpData.clusterStatus.error
            # which is a global error condition. We log HTTP timeouts per
            # taskTracker. Only connect-time timeouts are wrapped in URLError;
            # a timeout while waiting for or reading the response surfaces as
            # socket.timeout, so both are handled here.
            taskTracker.error = tmpData.clusterStatus.error
            tmpData.clusterStatus.error = ''
            continue

         with TimedExceptionHandler( 'http parse %s'
                                   % auxTaskTracker.url, self, tmpData ):
            self._parseHttpPage( page, auxTaskTracker, name, tmpData )

         localRunningJobs.update( auxTaskTracker.httpTasks.keys() )

      return localRunningJobs

   def _parseHttpPage( self, page, auxTaskTracker, trackerName, tmpData ):
      """ Extract the running task attempts from a TaskTracker status page.

          For every row of the "Running tasks" table the attempt name found in
          the first column is split into job and task ids; the job is added to
          auxTaskTracker.httpTasks if it is not there yet, flagged as a local
          job in tmpData, and the task is registered with newHttpTaskReport().

          Raises HttpParsingErrorException when the page does not have the
          expected layout or an attempt name has too few fields. """
      url = auxTaskTracker.url

      def _firstGroup( pattern, text ):
         # Group 1 of the first match of pattern in text, or a parsing error.
         match = re.search( pattern, text, re.DOTALL )
         if not match:
            error( 'Cannot find regexp:', pattern, 'in', text )
            raise HttpParsingErrorException(
                  url, 'Cannot find regexp:%s in data' % pattern )
         return match.group( 1 )

      httpTrace( 'Parsing page:', page )
      # Contents of <center>..</center> following the "Running tasks" header,
      # i.e. the <table>..</table> with the running tasks.
      runningTable = _firstGroup(
            '<h2>Running tasks</h2>.*?<center>(.*?)</center>', page )
      # Everything after the first </tr> (the header row): the data rows.
      dataRows = _firstGroup( '</tr>(.*?)</table>', runningTable )
      for row in re.findall( '<tr>(.*?)</tr>', dataRows, re.DOTALL ):
         # The first column holds
         # attempt_<jtid>_<jobid>_<m/r>_<taskid>_<attemptid>
         fields = _firstGroup( '<td>(.*?)</td>', row ).split( '_' )
         if len( fields ) <= 5:
            raise HttpParsingErrorException( url,
                                             'Bad taskId name %s' % fields )
         jtId = int( fields[ 1 ] )
         jobNumber = int( fields[ 2 ] )
         isMap = 'm' in fields[ 3 ]
         taskId = int( fields[ 4 ] )
         attemptId = int( fields[ 5 ] )

         jobKey = Tac.const( JobKey( jobNumber, str( jtId ) ) )
         if jobKey not in auxTaskTracker.httpTasks:
            auxTaskTracker.newRunningJob( jobKey )
         tmpData.setIsLocalJob( jobKey )
         auxTaskTracker.newHttpTaskReport(
               jobKey, TaskReportKey( taskId, attemptId, isMap ) )

   def _getTasksForJob( self, jobId, localTaskTrackers, tmpData, mapTasksPerJob,
                        reduceTasksPerJob, completedTasksPerJob ):
      """ Fetch from the JobTracker whatever is still missing for jobId: its
          task completion events, map task reports and reduce task reports.

          Each of the three dictionaries is only filled when it has no entry
          for jobId yet. Completion events are requested starting at the
          nextEventId stored for the job in the agent status, or at 0 when the
          job is not in the status. """
      rpcJobId = tacJobKey2JobId( jobId )

      if jobId not in completedTasksPerJob:
         with TimedExceptionHandler( 'update completed tasks for %r' % jobId, self,
                                     tmpData ):
            # Work out the event id to start collecting completed tasks from.
            clusterStatus = self.agent.status.clusterStatus[ self.clusterName ]
            fromEventId = 0
            if jobId in clusterStatus.runningJob:
               fromEventId = clusterStatus.runningJob[ jobId ].nextEventId
            debug( 'Next event id', fromEventId, 'for jobId', jobId )

            completedTasksPerJob[ jobId ] = self.jt.getTaskCompletionEvents(
                                            jobid=rpcJobId,
                                            fromEventId=fromEventId )
            rpcTrace( 'completedTasks:', completedTasksPerJob )

      if jobId not in mapTasksPerJob:
         with TimedExceptionHandler( 'update map tasks for %r' % jobId, self,
                                     tmpData ):
            mapReports = self.jt.getMapTaskReports( rpcJobId )
            mapTasksPerJob[ jobId ] = mapReports
            rpcTrace( 'mapTasks:', mapTasksPerJob )
            if traceHandle.enabled( 7 ):
               timeTrace( 'number of mapTasks:', len( mapTasksPerJob[ jobId ] ) )

      if jobId not in reduceTasksPerJob:
         with TimedExceptionHandler( 'update reduce tasks for %r' % jobId, self,
                                     tmpData ):
            reduceReports = self.jt.getReduceTaskReports( rpcJobId )
            reduceTasksPerJob[ jobId ] = reduceReports
            rpcTrace( 'reduceTasks', reduceTasksPerJob )
            if traceHandle.enabled( 7 ):
               timeTrace( 'number of reduceTasks:',
                          len( reduceTasksPerJob[ jobId ] ) )

   def _updateCompletedTasks( self, jobId, completedTasksPerJob, localTaskTrackers,
                              tmpData ):
      """ Sort the completion events of jobId onto the local task trackers.

          An event whose tracker is in localTaskTrackers is removed from that
          tracker's httpTasks (if present) and registered with
          newCompletedTask() when its status is SUCCEEDED, or with
          newNotCompletedTask() otherwise. The id of the last event seen is
          handed to tmpData.setNxtEventId(). Raises ParsingErrorException when
          an event's taskTrackerHttp does not look like http://<name>:<port>.
      """
      if jobId not in completedTasksPerJob:
         return

      debug( 'Processing completedTask For job', jobId )
      lastEventId = None
      for event in completedTasksPerJob[ jobId ]:
         debug( 'Processing event', event )
         lastEventId = event.eventId
         if event.isMap != event.taskid.taskid.ismap:
            # We don't have a nice way of figuring out if this is a setup task
            # so use the discrepancy that Hadoop has between the two ismap
            # fields as a hint that this is not the task we are looking for.
            continue

         # If the task got killed because its TT node was removed we may have
         # no http address and a 'killed' status. Ignore such events: the tasks
         # get rescheduled on other trackers and their bytes are counted there.
         if not event.taskTrackerHttp and event.status == "KILLED":
            debug( "Ignoring killed event with null taskTrackerHttp: %r" %
                   event )
            continue

         # taskTrackerHttp is like http://r12s8-10g.cs1.aristanetworks.com.:10060
         match = re.search( 'http://(.*):.*', event.taskTrackerHttp, re.DOTALL )
         if not match:
            errorMsg = ( 'Cannot find taskTracker name in %s' %
                         event.taskTrackerHttp )
            error( errorMsg )
            raise ParsingErrorException( event.taskTrackerHttp, errorMsg )
         trackerName = match.group( 1 )
         if trackerName not in localTaskTrackers:
            continue

         auxTaskTracker = tmpData.auxTaskTrackerInfo[ trackerName ]
         taskReportKey = taskReportKey2Tac( event.taskid.attemptid,
                                            event.taskid.taskid )
         # The task is also in the http tasks when it completed between the
         # HTTP fetch and the getCompletionEvents RPC call; drop it from there.
         if taskReportKey in auxTaskTracker.httpTasks.get( jobId, {} ):
            auxTaskTracker.httpTasks[ jobId ].remove( taskReportKey )
            debug( 'Http task removed', jobId, taskReportKey )

         if event.status == 'SUCCEEDED':
            auxTaskTracker.newCompletedTask( jobId, taskReportKey )
            debug( 'add completed event for',
                   trackerName, jobId, taskReportKey )
         else:
            auxTaskTracker.newNotCompletedTask( jobId, taskReportKey )
            debug( 'add not completed event for',
                   trackerName, jobId, taskReportKey )

      if lastEventId is not None:
         tmpData.setNxtEventId( jobId, lastEventId )

   def _processTasks( self, jobId, dictTasksPerJob, tmpData, localTaskTrackers,
                      mapDictTasks ):
      """ Account the tasks of jobId that ran on the local task trackers.

          For every tracker in localTaskTrackers this walks the tasks of jobId
          that completed in this interval (auxTaskTracker.completedTasks) and
          the ones seen on the tracker's HTTP page (auxTaskTracker.httpTasks).
          Tasks of the matching type that have a task report in
          dictTasksPerJob[ jobId ] are handed to _addCompletedTask() or
          _addHttpTask() respectively.

          jobId             -- key of the job being processed
          dictTasksPerJob   -- task reports indexed by jobId and then by task
                               id; holds either map or reduce reports
          tmpData           -- per-poll data; its auxTaskTrackerInfo is read
          localTaskTrackers -- dict of local taskTracker name to interface
          mapDictTasks      -- True if dictTasksPerJob holds map task reports,
                               False if it holds reduce task reports
      """
      def _sameAttemptId( taskReportKey, taskReport ):
         """ Check if taskReportKey.attemptId exists in the report's attempts.
             A report without attempts only matches attemptId 0. """
         if not taskReport.attempts:
            return taskReportKey.attemptId == 0
         return any( attempt.attemptid == taskReportKey.attemptId
                     for attempt in taskReport.attempts )

      def _shouldProcessTask( task, jobId, taskReportKey ):
         """ Return True only when the task has at most one attempt and that
             attempt is the one in taskReportKey. Otherwise flag the job so
             that its completed events are still retrieved in the next poll
             and return False. """
         if len( task.attempts ) > 1:
            # Until we implement a treatment for speculative tasks we don't
            # do anything about it other than at least guarantee we will
            # retrieve the completed event for it.
            self.currJobsWithIncompleteTasks.add( jobId )
            return False
         if _sameAttemptId( taskReportKey, task ):
            return True
         debug( 'Not same attemptId', taskReportKey, task )
         # This is a race on the speculative task where the taskTracker
         # hasn't checkpointed with jobtracker yet.  Use the speculative
         # treatment of making sure we get completed events
         self.currJobsWithIncompleteTasks.add( jobId )
         return False

      def _sameTaskType( taskId, mapDictTasks ):
         """ True if the task is of the same type (map or reduce) as the
             dictionary of task reports being processed. """
         return taskId.ismap == mapDictTasks

      trace( '_processTasks entry, jobId:', jobId )
      maxTime = 0
      curSum = 0
      curCount = 0
      # pylint: disable=too-many-nested-blocks
      for name, intf in localTaskTrackers.items():
         auxTaskTracker = tmpData.auxTaskTrackerInfo[ name ]
         if jobId in auxTaskTracker.completedTasks:
            if jobId not in dictTasksPerJob:
               # If there is a completedTask for the job we should have
               # taskReports for the job
               error( 'Completed task without RPC jobId', jobId )
               continue
            taskReports = dictTasksPerJob[ jobId ]
            for taskReportKey in auxTaskTracker.completedTasks[ jobId ]:
               start = time.time()
               taskId = taskIdFromTaskReportKey( jobId, taskReportKey )
               # Only look at map tasks in the map reports and at reduce tasks
               # in the reduce reports.
               if _sameTaskType( taskId, mapDictTasks ):
                  if taskId not in taskReports:
                     error( 'Completed task without RPC taskId', jobId , taskId )
                     debug( 'taskId not in dictTasksPerJob:', taskReports )
                  elif _sameAttemptId( taskReportKey, taskReports[ taskId ] ):
                     self._addCompletedTask( jobId, taskReportKey, name, intf,
                                             taskReports[ taskId ].counters,
                                             tmpData )
                  else:
                     error( 'Completed task with different attemptId',
                            taskReportKey, taskReports[ taskId ] )

               if traceHandle.enabled( 7 ):
                  ( maxTime, curSum, curCount ) = profileTime( maxTime, curSum,
                                                               curCount, start )
         if jobId in auxTaskTracker.httpTasks:
            if jobId not in dictTasksPerJob:
               # No task id is known at this point: logging one here used to
               # raise UnboundLocalError instead of reporting the problem.
               error( 'Task in HTTP but could not get jobid from RPC', jobId )
               continue
            taskReports = dictTasksPerJob[ jobId ]
            for taskReportKey in auxTaskTracker.httpTasks[ jobId ]:
               start = time.time()
               taskId = taskIdFromTaskReportKey( jobId, taskReportKey )
               # Only look at map tasks in the map reports and at reduce tasks
               # in the reduce reports.
               if _sameTaskType( taskId, mapDictTasks ):
                  # For http we don't look at tasks with multiple attempts.
                  # We account for them when it completes
                  if taskId not in taskReports:
                     debug( 'HTTP task without RPC taskId', jobId , taskId )
                     debug( 'taskId not in dictTasksPerJob:', taskReports )
                  elif _shouldProcessTask( taskReports[ taskId ], jobId,
                                           taskReportKey ):
                     self._addHttpTask( jobId, name, taskReportKey, auxTaskTracker,
                                        intf, taskReports[ taskId ], tmpData )

               if traceHandle.enabled( 7 ):
                  ( maxTime, curSum, curCount ) = profileTime( maxTime, curSum,
                                                               curCount, start )
      if traceHandle.enabled( 7 ):
         timeTrace( 'For JobId:', jobId, 'Avg time per task',
                    curSum / curCount if curCount else 0,
                    'Max time', maxTime, 'number of tasks', curCount )
      trace( '_processTasks exit, jobId:', jobId )

   def _getTaskTrackerUrl( self, ipAddress ):
      # Make the taskTracker URL from http port and standard format.
      # This would be "http://<ipAddress>:<port>/tasktracker.jsp"
      return f"http://{ipAddress}:{self.httpPort}/tasktracker.jsp"

class HadoopTracerMR2Thread( HadoopTracerBaseThread ):
   def __init__( self, clusterName, agent ) :
      """ Create the tracer thread for the MR2 (Yarn) cluster clusterName.

          Copies the ResourceManager and JobHistory server host/port out of
          agent.config.clusterConfig[ clusterName ], builds a YarnClient from
          them and enables its cache, and starts with empty per-job sets of
          completed task attempts for the previous and the current poll. """
      trace( 'HadoopTracerMR2Thread.__init__ entry for', clusterName )
      super().__init__( clusterName, agent )
      clusterConfig = agent.config.clusterConfig[ clusterName ]

      # We copy the clusterConfig since the config might be removed (no on the CLI)
      self.resourceManagerHost = clusterConfig.resourceManagerHost
      self.resourceManagerPort = clusterConfig.resourceManagerPort
      self.jobHistoryServerHost = clusterConfig.jobHistoryHost
      self.jobHistoryServerPort = clusterConfig.jobHistoryPort
      self.yarnClient = YarnClient( name=self.clusterName,
                                    rmHost=self.resourceManagerHost,
                                    rmPort=self.resourceManagerPort,
                                    jhHost=self.jobHistoryServerHost,
                                    jhPort=self.jobHistoryServerPort )
      self.yarnClient.enableCache()
      # Task attempts (TaskReportKeys) known to be completed, per job, as of
      # the previous and the current poll.
      self.prevCompletedTasksPerJob = defaultdict( set )
      self.currCompletedTasksPerJob = defaultdict( set )
      # Name the real class so the exit trace pairs up with the entry trace.
      trace( 'HadoopTracerMR2Thread.__init__ exit for', clusterName )

   def _periodicCleanup( self, tmpData ):
      """ Clear the YarnClient cache and, when the poll finished without a
          cluster error, make the current set of completed tasks the previous
          one and start a fresh current set. """
      self.yarnClient.clearCache()
      if tmpData.clusterStatus.error:
         # Failed poll: leave both sets of completed tasks untouched.
         return

      self.prevCompletedTasksPerJob = self.currCompletedTasksPerJob.copy()
      self.currCompletedTasksPerJob.clear()
      debug( 'prevCompletedTasksPerJob:', self.prevCompletedTasksPerJob )

   @classmethod
   def isClusterConfigComplete( cls, agentCfg, cfg ):
      """ Check if the clusterConfig contains all required parameters """
      return ( agentCfg.enabled and cfg.enabled and cfg.resourceManagerHost and
               cfg.resourceManagerPort and cfg.jobHistoryHost and cfg.jobHistoryPort
               and cfg.interval )

   def _getClusterStatus( self, tmpData ):
      """ Ask the YarnClient for the cluster status, publish it in tmpData as
          a ClusterInfo and return the active nodes it reported. """
      with TimedExceptionHandler( "get the cluster status", self, tmpData ):
         yarnStatus = self.yarnClient.getClusterStatus()
         rpcTrace( 'clusterStatus:', yarnStatus )

      clusterInfo = ClusterInfo(
            status=chkTacTypeEnum( Status, yarnStatus.state, tmpData ),
            activeTrackers=len( yarnStatus.activeNodes ),
            decommissionedNodes=yarnStatus.decommissionedNodes,
            runningMapTasks=yarnStatus.mapTasksRunning,
            runningReduceTasks=yarnStatus.reduceTasksRunning,
            allocatedMB=yarnStatus.allocatedMB,
            totalMB=yarnStatus.totalMB )
      tmpData.clusterInfoIs( clusterInfo )

      return yarnStatus.activeNodes

   def _getRunningJobs( self, tmpData ):
      """ Publish in tmpData one JobInfo for every running job reported by the
          YarnClient. Raises ParsingErrorException when a job id does not have
          the job_<jtid>_<jobid> layout. """
      with TimedExceptionHandler( "get the running jobs", self, tmpData ):
         yarnJobs = self.yarnClient.getRunningJobs()
         rpcTrace( 'runningJobs:', yarnJobs )

      jobInfos = []
      for yarnJob in yarnJobs:
         # jobid is of format job_<jtid>_<jobid>
         fields = yarnJob.jobid.split( '_' )
         if len( fields ) < 3:
            raise ParsingErrorException( yarnJob.jobid,
                                         'Malformed Job Id: %r' % yarnJob.jobid )
         jobKey = JobKey( int( fields[ 2 ] ), fields[ 1 ] )

         # Fall back to an empty failureInfo if the diagnostics cannot be
         # turned into a string.
         failureInfo = ""
         try:
            failureInfo = str( yarnJob.diagnostics )
         except Exception as e: # pylint: disable-msg=broad-except
            warn( "ignoring: hit an exception converting unicode to string:", e )

         jobInfos.append( JobInfo(
               jobKey,
               name=yarnJob.name,
               user=yarnJob.user,
               startTime=milliSecToSec( yarnJob.startTime ),
               mapProgress=yarnJob.mapProgress,
               reduceProgress=yarnJob.reduceProgress,
               runState=chkTacTypeEnum( RunState, yarnJob.state, tmpData ),
               failureInfo=failureInfo,
               queueName=yarnJob.queueName,
               url=yarnJob.url ) )

      tmpData.runningJobIs( jobInfos )

   def _getRunningTasks( self, localTaskTrackers, tmpData ):
      """ Register the running task attempts that sit on local nodes and
          return the set of jobs they belong to.

          Attempts without a node address, on a node that is not in
          localTaskTrackers, or on a node not flagged isLocal are ignored.
          Raises ParsingErrorException on a malformed attempt id. """
      with TimedExceptionHandler( "get the running tasks", self, tmpData ):
         runningTaskAttempts = self.yarnClient.getRunningTaskAttempts()
         rpcTrace( 'runningTaskAttempts:', runningTaskAttempts )

      localRunningJobs = set()
      for taskAttempt in runningTaskAttempts:
         # nodeHttpAddress is of format <node>:<port>
         nodeName = None
         if taskAttempt.nodeHttpAddress:
            nodeName = taskAttempt.nodeHttpAddress.split( ':' )[ 0 ]
         # Just process local nodes.
         if not nodeName or nodeName not in localTaskTrackers:
            continue
         auxTaskTracker = tmpData.auxTaskTrackerInfo[ nodeName ]
         if not auxTaskTracker.isLocal:
            continue

         # attemptid is of format:
         # attempt_<jtid>_<jobid>_<m/r>_<taskid>_<attemptId>
         fields = taskAttempt.attemptid.split( '_' )
         if len( fields ) < 6:
            raise ParsingErrorException( taskAttempt.attemptid,
                                         'Malformed TaskAttempt Id: %r' %
                                         taskAttempt.attemptid )
         jobNumber = int( fields[ 2 ] )
         isMap = 'm' in fields[ 3 ]
         taskNumber = int( fields[ 4 ] )
         attemptNumber = int( fields[ 5 ] )
         jobKey = Tac.const( JobKey( jobNumber, fields[ 1 ] ) )

         localRunningJobs.add( jobKey )
         if jobKey not in auxTaskTracker.httpTasks:
            auxTaskTracker.newRunningJob( jobKey )
         tmpData.setIsLocalJob( jobKey )
         auxTaskTracker.newHttpTaskReport(
               jobKey, TaskReportKey( taskNumber, attemptNumber, isMap ) )

      return localRunningJobs

   def _getTasksForJob( self, jobId, localTaskTrackers, tmpData, mapTasksPerJob,
                        reduceTasksPerJob, completedTasksPerJob ):
      """ Get the map/reduce and completed tasks for the jobId in Yarn

          When jobId is missing from mapTasksPerJob or reduceTasksPerJob, the
          task attempts of the job are fetched from the YarnClient and filtered:
          pending attempts, attempts on nodes that are not in localTaskTrackers
          and attempts already recorded in self.prevCompletedTasksPerJob are
          dropped. The remaining attempts get their counters attached and are
          stored in mapTasksPerJob[ jobId ] / reduceTasksPerJob[ jobId ],
          indexed by TaskReportKey. Nothing is stored, and the method returns
          early, when the job has no task attempts.

          Along the way the job is added to self.currJobsWithIncompleteTasks if
          it has running attempts on local nodes (or pending attempts and it
          was already in self.prevJobsWithIncompleteTasks), and the keys of
          complete/failed/killed local attempts are added to
          self.currCompletedTasksPerJob[ jobId ].

          Finally, if jobId has no entry in completedTasksPerJob yet, it is
          filled with the attempts that are in the current but not in the
          previous set of completed tasks.

          Raises ParsingErrorException when an attempt id has fewer than six
          '_' separated fields. """
      trace( "_getTasksForJob for jobId:", jobId )
      # Yarn job ids look like job_<jtid>_<jobid>, with jobid zero-padded to at
      # least four digits.
      yarnJobId = 'job_%s_%04d' % ( jobId.jtId, jobId.jobId )

      # mapTasksPerJob and reduceTasksPerJob are dictionaries indexed by jobId.
      # Fill mapTasksPerJob[jobId] and reduceTasksPerJob[jobId] with dictionaries
      # of TaskAttemptStatus for all "map" and "reduce" task attempts for jobId,
      # indexed by TaskReportKey.
      if jobId not in mapTasksPerJob or jobId not in reduceTasksPerJob:
         mapTaskAttempts = {}
         reduceTaskAttempts = {}
         with TimedExceptionHandler( 'get task attempts for %r' % jobId, self,
                                     tmpData ):
            taskAttempts = self.yarnClient.getTaskAttempts( yarnJobId )
            rpcTrace( 'taskAttempts:', taskAttempts )
            if traceHandle.enabled( 7 ):
               timeTrace( 'number of taskAttempts:', len( taskAttempts ) )

         if not taskAttempts:
            info( 'No TaskAttempts for Job Id: %r' % yarnJobId )
            return

         # Attempts that survive the filters below, in the order their counters
         # are requested.
         taskAttemptsForCounters = []
         for taskAttempt in taskAttempts:
            # attemptid is of format:
            # attempt_<jtid>_<jobid>_<m/r>_<taskid>_<attemptId>
            data = taskAttempt.attemptid.split( '_' )
            if len( data ) < 6:
               raise ParsingErrorException( taskAttempt.attemptid,
                                            'Malformed TaskAttempt Id: %r' %
                                            taskAttempt.attemptid )
            isMap = 'm' in data[ 3 ]
            tId = int( data[ 4 ] )
            attemptId = int( data[ 5 ] )
            taskReportKey = Tac.const( TaskReportKey( tId, attemptId, isMap ) )

            if taskAttempt.state == 'pending':
               # If there are pending task attempts, add this job to
               # currJobsWithIncompleteTasks set so that we process this job in
               # in the next interval. Add job only if job was known to be local
               # job in previous poll. This way, we do not add jobs with only
               # pending tasks on non-local nodes.
               if jobId in self.prevJobsWithIncompleteTasks:
                  self.currJobsWithIncompleteTasks.add( jobId )

               # No need to save and process task attempts that are pending.
               debug( 'Not saving TaskAttemptStatus for', taskAttempt.attemptid,
                      'since it is in', taskAttempt.state, 'state' )
               continue

            # No need to save and process task attempts that did not run on a local
            # node. nodeHttpAddress is of format <node>:<port>
            nodeName = taskAttempt.nodeHttpAddress.split( ':' )[ 0 ] if \
                  taskAttempt.nodeHttpAddress else None
            if ( not nodeName or nodeName not in localTaskTrackers ):
               debug( 'Not saving TaskAttemptStatus for', taskAttempt.attemptid,
                      'since its node', nodeName, 'is not a local node' )
               continue

            # If there are running task attempts on local nodes, add this job to
            # currJobsWithIncompleteTasks so that we process this job in the
            # next interval.
            if taskAttempt.state == 'running':
               self.currJobsWithIncompleteTasks.add( jobId )

            # self.currCompletedTasksPerJob is a dictionary indexed by jobId.
            # Update self.currCompletedTasksPerJob[jobId] with a set of
            # TaskReport keys for all completed task attempts as of now.
            if taskAttempt.state in ( 'complete', 'failed', 'killed' ):
               self.currCompletedTasksPerJob[ jobId ].add( taskReportKey )

            # No need to save and process completed task attempts that
            # were already processed in previous polls.
            if taskReportKey in self.prevCompletedTasksPerJob.get( jobId, {} ):
               debug( 'Not saving TaskAttemptStatus for', taskAttempt.attemptid,
                      'in state:', taskAttempt.state, 'since it was already '
                      'processed as a completed task in previous polls' )
               continue

            if isMap:
               mapTaskAttempts[ taskReportKey ] = taskAttempt
            else:
               reduceTaskAttempts[ taskReportKey ] = taskAttempt

            # Need to get counters for this task attempt.
            taskAttemptsForCounters.append( taskAttempt )

         # Get counters for all task attempts of interest.
         attemptIds = [ attempt.attemptid for attempt in taskAttemptsForCounters ]
         with TimedExceptionHandler( 'get the task attempt counters for %r' %
                                     jobId, self, tmpData ):
            attemptCounters = self.yarnClient.getTaskAttemptCounters( attemptIds )
            rpcTrace( 'attempIds:', attemptIds, 'attemptCounters:', attemptCounters )
            if traceHandle.enabled( 7 ):
               timeTrace( 'number of taskAttempt counters:', len( attemptCounters ) )

         # Counters are matched to attempts by position.
         # NOTE(review): assumes getTaskAttemptCounters() returns one entry per
         # requested id, in request order - confirm against YarnClient.
         for index, attempt in enumerate( taskAttemptsForCounters ):
            attempt.counters = attemptCounters[ index ]

         debug( 'mapTaskAttempts:', mapTaskAttempts )
         debug( 'reduceTaskAttempts:', reduceTaskAttempts )

         mapTasksPerJob[ jobId ] = mapTaskAttempts
         reduceTasksPerJob[ jobId ] = reduceTaskAttempts

      debug( 'self.currCompletedTasksPerJob:', self.currCompletedTasksPerJob )
      debug( 'self.prevCompletedTasksPerJob:', self.prevCompletedTasksPerJob )

      # completedTasksPerJob is a dictionary indexed by jobId.
      # Update completedTasksPerJob[jobId] with dictionary of TaskAttemptStatus
      # for all task attempts that completed since the last poll, indexed by
      # TaskReportKey.
      if ( jobId in self.currCompletedTasksPerJob and
           jobId not in completedTasksPerJob ):
         completedTasks = ( self.currCompletedTasksPerJob[ jobId ] -
                            self.prevCompletedTasksPerJob[ jobId ] )
         completedTasksPerJob[ jobId ] = {}
         # NOTE(review): the lookups below expect every newly completed key to
         # be present in mapTasksPerJob / reduceTasksPerJob for this job; a key
         # left in currCompletedTasksPerJob by an earlier poll and not saved
         # again in this one would raise KeyError - confirm this cannot happen.
         for taskReportKey in completedTasks:
            trace( 'Newly completed task in this interval, taskReportKey:',
                   taskReportKey )
            if taskReportKey.isMap:
               completedTasksPerJob[ jobId ][ taskReportKey ] = \
                     mapTasksPerJob[ jobId ][ taskReportKey ]
            else:
               completedTasksPerJob[ jobId ][ taskReportKey ] = \
                     reduceTasksPerJob[ jobId ][ taskReportKey ]

      debug( 'mapTasksPerJob:', mapTasksPerJob )
      debug( 'reduceTasksPerJob:', reduceTasksPerJob )
      debug( 'completedTasksPerJob:', completedTasksPerJob )

   def _updateCompletedTasks( self, jobId, completedTasksPerJob, localTaskTrackers,
                              tmpData ):
      """ Record on the local trackers the task attempts of jobId that
          completed since the last poll.

          completedTasksPerJob[ jobId ] is a dictionary of TaskAttemptStatus
          indexed by TaskReportKey. Each attempt whose node is in
          localTaskTrackers is removed from that tracker's httpTasks (if
          present) and registered with newCompletedTask(). Raises
          ParsingErrorException when an attempt's nodeHttpAddress does not look
          like <node>:<port>. """
      if jobId not in completedTasksPerJob:
         return

      debug( 'Processing completedTask For job', jobId )
      for taskReportKey in completedTasksPerJob[ jobId ]:
         debug( 'Processing taskReportKey', taskReportKey )
         attemptStatus = completedTasksPerJob[ jobId ][ taskReportKey ]
         debug( 'attemptStatus ', attemptStatus )

         # nodeHttpAddress is of format r12s8-10g.cs1.aristanetworks.com:10060
         nodeHttpAddress = attemptStatus.nodeHttpAddress
         match = re.search( '(.*):.*', nodeHttpAddress, re.DOTALL )
         if not match:
            errorMsg = 'Cannot find taskTracker name in %s' % nodeHttpAddress
            error( errorMsg )
            raise ParsingErrorException( nodeHttpAddress, errorMsg )
         trackerName = match.group( 1 )
         if trackerName not in localTaskTrackers:
            continue

         auxTaskTracker = tmpData.auxTaskTrackerInfo[ trackerName ]
         # The task is also in the http tasks when it completed between
         # _getRunningTasks() and _getTasksForJob(); drop it from there.
         if taskReportKey in auxTaskTracker.httpTasks.get( jobId, {} ):
            auxTaskTracker.httpTasks[ jobId ].remove( taskReportKey )
            debug( 'Http task removed', jobId, taskReportKey )

         auxTaskTracker.newCompletedTask( jobId, taskReportKey )
         debug( 'add completed event for', trackerName, jobId, taskReportKey )

   def _processTasks( self, jobId, dictTasksPerJob, tmpData, localTaskTrackers,
                      mapDictTasks ):
      """ Go over this interval's completed tasks and http tasks of jobId on every
          local task tracker and feed the ones of the requested type, together
          with their entry from dictTasksPerJob, to _addCompletedTask and
          _addHttpTask.

          dictTasksPerJob is indexed by jobId and then by TaskReportKey.
          mapDictTasks is compared against taskReportKey.isMap, so only map tasks
          ( True ) or only reduce tasks ( False ) are processed in one call.
          localTaskTrackers is iterated as ( tracker name, intf ) pairs.
      """
      trace( '_processTasks entry, jobId:', jobId )
      # Profiling accumulators, only updated when trace level 7 is enabled
      slowest = 0
      totalTime = 0
      numTasks = 0
      # pylint: disable=too-many-nested-blocks
      for trackerName, intf in localTaskTrackers.items():
         auxTaskTracker = tmpData.auxTaskTrackerInfo[ trackerName ]

         if jobId in auxTaskTracker.completedTasks:
            if jobId not in dictTasksPerJob:
               # A completed task for the job means we should also have
               # taskReports for the job
               error( 'Found completed task for jobId %s, but jobId is not in '
                      'dictTasksPerJob: %s' % ( jobId, dictTasksPerJob ) )
               continue
            jobTasks = dictTasksPerJob[ jobId ]
            for taskReportKey in auxTaskTracker.completedTasks[ jobId ]:
               began = time.time()
               # Only handle tasks of the same type: map with map or reduce
               # with reduce
               if taskReportKey.isMap == mapDictTasks:
                  if taskReportKey in jobTasks:
                     self._addCompletedTask( jobId, taskReportKey, trackerName,
                                             intf,
                                             jobTasks[ taskReportKey ].counters,
                                             tmpData )
                  else:
                     error( 'Found completed task for jobId %s taskReportKey %s, '
                            'but taskReportKey is not in dictTasksPerJob[jobId]: %s'
                            % ( jobId, taskReportKey, jobTasks ) )

               if traceHandle.enabled( 7 ):
                  slowest, totalTime, numTasks = profileTime( slowest, totalTime,
                                                              numTasks, began )

         if jobId in auxTaskTracker.httpTasks:
            if jobId not in dictTasksPerJob:
               error( 'Found running task for jobId %s, but jobId is not in '
                      'dictTasksPerJob: %s' % ( jobId, dictTasksPerJob ) )
               continue
            jobTasks = dictTasksPerJob[ jobId ]
            for taskReportKey in auxTaskTracker.httpTasks[ jobId ]:
               began = time.time()
               # Only handle tasks of the same type: map with map or reduce
               # with reduce
               if taskReportKey.isMap == mapDictTasks:
                  if taskReportKey in jobTasks:
                     attempt = jobTasks[ taskReportKey ]
                     # An attempt that is not running anymore must have completed
                     # after the call to getRunningTaskAttempts(), do not treat it
                     # as a running task.  Note this also skips the profiling
                     # below for this task.
                     if attempt.state != 'running':
                        trace( 'Task for jobId %s taskId %s appears in both '
                               'running and completed task lists. Considering the '
                               'task as completed' % ( jobId, taskReportKey ) )
                        continue
                     self._addHttpTask( jobId, trackerName, taskReportKey,
                                        auxTaskTracker, intf, attempt, tmpData )
                  else:
                     error( 'Found running task for jobId %s taskReportKey %s, '
                            'but taskReportKey is not in dictTasksPerJob[jobId]: %s'
                            % ( jobId, taskReportKey, jobTasks ) )

               if traceHandle.enabled( 7 ):
                  slowest, totalTime, numTasks = profileTime( slowest, totalTime,
                                                              numTasks, began )

      if traceHandle.enabled( 7 ):
         avgTime = totalTime / numTasks if numTasks else 0
         timeTrace( 'For JobId:', jobId, 'Avg time per task', avgTime,
                    'Max time', slowest, 'number of tasks', numTasks )
      trace( '_processTasks exit, jobId:', jobId )

class ResultQueue( Tac.File ):
   """ Bounded queue between the HadoopTracer threads and the agent.  A thread
       calls writeData, which queues the entry and writes one byte to a pipe;
       readQueueData is registered as the read handler of the pipe's read end
       and passes every queued entry to the callback.  """
   MAX_QUEUE_ENTRIES = 100

   def __init__( self, cb ):
      """ cb is called with each entry taken out of the queue. """
      self.rpipe, self.wpipe = os.pipe()
      super().__init__( self.rpipe, readHandler=self.readQueueData )
      # Queue the HadoopTracer threads put their data in
      self.resultQueue = queue.Queue( maxsize=self.MAX_QUEUE_ENTRIES )
      self.cb = cb

   def readQueueData( self, data ):
      """ Read handler: drain the queue, calling the callback for each entry.
          data is not used.  Any exception aborts the process. """
      trace( 'ResultQueue.readQueueData entry' )
      pending = self.resultQueue
      debug( 'readQueue with:', pending.qsize() )
      try:
         while not pending.empty():
            entry = pending.get_nowait()
            debug( 'item:', entry.clusterName )
            self.cb( entry )
      except Exception: # pylint: disable-msg=W0703
         # This shouldn't happen so something is out of control, better restart
         # the agent
         stamp = datetime.now().strftime( '%Y-%m-%d %H:%M:%S' )
         print( '{}: {}'.format( stamp,
            'Aborting HadoopTracer agent due to unexpected cond processing queue' ) )
         traceback.print_exc()
         os.abort()

      trace( 'ResultQueue.readQueueData exit' )

   def writeData( self, queueEntry ):
      """ Queue queueEntry and signal the reader through the pipe.  The entry is
          dropped when the queue is full. """
      clusterName = queueEntry.clusterName
      trace( 'ResultQueue.writeData entry', clusterName )
      try:
         self.resultQueue.put_nowait( queueEntry )
      except queue.Full:
         # TODO, should we log a message instead so we have a record of this?
         debug( 'Dropping message from cluster', clusterName )
      else:
         # Write only after the item is on the queue to avoid a race
         os.write( self.wpipe, b'd' )

      trace( 'ResultQueue.writeData exit', clusterName )

#
# The HadoopTracer agent takes care of the config reactors and starts and stops
# the threads that poll the Hadoop cluster.
#
class HadoopTracer( Agent.Agent ):
   """ The HadoopTracer agent. """
   def __init__( self, entityManager ):
      """ Give every attribute its initial value, create the ResultQueue and the
          thread cleanup clock, then initialize the Agent base class. """
      trace( 'HadoopTracer.__init__ entry' )
      # Sysdb entities, assigned in doInit
      self.config = None
      self.status = None
      self.cmdRequest = None
      self.mlagStatus = None
      # Arp and bridging state, assigned in doInit
      self.arpStatus = None
      self.arpSmash = None
      self.arpSmashVrfIdMap = None
      self.arpShimSm = None
      self.smashBridgingStatus = None
      # Config reactors, assigned in _mountDone
      self.configReactor_ = None
      self.clusterConfigReactor_ = None
      self.clusterCmdRequestReactor_ = None
      self.warm_ = False
      # Polling threads, indexed by cluster name
      self.threads_ = {}
      self.resultQueue = ResultQueue( self.updateMapReduceData )
      # Thread status is checked every cleanupInterval seconds
      self.cleanupInterval = 5
      # The clock starts at endOfTime; maybeStartThread moves timeMin to
      # now + cleanupInterval when it starts a thread
      cleanupClock = Tac.ClockNotifiee()
      cleanupClock.handler = self.cleanupThread
      cleanupClock.timeMin = Tac.endOfTime
      self.cleanupThreadClock = cleanupClock

      super().__init__( entityManager )
      trace( 'HadoopTracer.__init__ exit' )

   def _addCurBurst( self, curBurst, intfId, jobId, jobName, curAccumBytes,
                     sysdbAccumBytes, taskTrackers ):
      """ Append to curBurst[ intfId ] the burst of jobId for this interval, i.e.
          the difference between curAccumBytes and sysdbAccumBytes, for further
          topBurstCounter processing.

          Nothing is done when curBurst is None.  The in burst is the growth of
          hdfsBytesRead + reduceShuffleBytes, the out burst is the growth of
          hdfsBytesWritten.  taskTrackers is only used for tracing. """
      if curBurst is None:
         return

      intfBursts = curBurst.setdefault( intfId, [] )
      debug( 'Values for curBurst', intfId, jobId, curAccumBytes,
             sysdbAccumBytes )
      inBytes = ( curAccumBytes.hdfsBytesRead +
                  curAccumBytes.reduceShuffleBytes -
                  sysdbAccumBytes.hdfsBytesRead -
                  sysdbAccumBytes.reduceShuffleBytes )
      outBytes = ( curAccumBytes.hdfsBytesWritten -
                   sysdbAccumBytes.hdfsBytesWritten )
      burstInfo = BurstCountersInfo( jobId=jobId, jobName=jobName,
                                     inBytes=inBytes, outBytes=outBytes )
      intfBursts.append( burstInfo )
      debug( 'Added curBurst', intfId, jobId, outBytes, inBytes, taskTrackers )

   def _processBurstCounters( self, status, curBurst, lastUpdate, arrayMaxSize ):
      """ For every interface in curBurst, offer this interval's job bursts to
          the in and out top burst lists of status.topBurstCounter[ intfId ] and
          write a list back to Sysdb when at least one burst was inserted in it.

          curBurst is indexed by intfId and holds lists of burst info entries;
          lastUpdate and arrayMaxSize are handed to _checkInsertBurstCounter.
      """
      def _storeBursts( bursts, isInCounter, topBurst ):
         """ Write bursts over the sysdb in or out burst array, updating the
             matching element count after every entry """
         for idx, burstCounter in enumerate( bursts ):
            if isInCounter:
               topBurst.burstInCounter[ idx ] = burstCounter
               topBurst.numBurstInElements = idx + 1
            else:
               topBurst.burstOutCounter[ idx ] = burstCounter
               topBurst.numBurstOutElements = idx + 1

      def _loadBursts( burstArray, numElem ):
         """ Return the entries of burstArray found before the first index that
             is not below numElem """
         bursts = []
         for idx in burstArray:
            if idx >= numElem:
               break
            bursts.append( burstArray[ idx ] )
         return bursts

      trace( '_processBurstCounters for', curBurst, 'entry' )
      for intfId, burstInfoList in curBurst.items():
         if intfId not in status.topBurstCounter:
            debug( 'Creating new topBurstCounter', intfId )
            status.newTopBurstCounter( intfId )
         topBurst = status.topBurstCounter[ intfId ]
         outBursts = _loadBursts( topBurst.burstOutCounter,
                                  topBurst.numBurstOutElements )
         inBursts = _loadBursts( topBurst.burstInCounter,
                                 topBurst.numBurstInElements )
         debug( 'Processing burst counter for', intfId )
         if Tracing.enabled( 4 ):
            for burst in outBursts:
               debug( 'outBurst:', burst )
            for burst in inBursts:
               debug( 'inBurst:', burst )

         inChanged = False
         outChanged = False
         for burstInfo in burstInfoList:
            # Both checks always run; the flags only remember a successful one
            inChanged = self._checkInsertBurstCounter(
                  inBursts, arrayMaxSize, burstInfo, lastUpdate,
                  True ) or inChanged
            outChanged = self._checkInsertBurstCounter(
                  outBursts, arrayMaxSize, burstInfo, lastUpdate,
                  False ) or outChanged

         if inChanged:
            _storeBursts( inBursts, True, topBurst )
         if outChanged:
            _storeBursts( outBursts, False, topBurst )

      if Tracing.enabled( 8 ):
         sysdbTrace( 'Number of intfs with burstCounters',
                     len( status.topBurstCounter ) )
         for intfId in status.topBurstCounter:
            sysdbTrace( 'Burst counters for intf', intfId )
            intfTopBurst = status.topBurstCounter[ intfId ]
            for idx in range( intfTopBurst.numBurstInElements ):
               sysdbTrace( 'In Burst counter:', intfTopBurst.burstInCounter[ idx ] )
            for idx in range( intfTopBurst.numBurstOutElements ):
               sysdbTrace( 'Out Burst counter:',
                           intfTopBurst.burstOutCounter[ idx ] )

   def _insertBurstCounter( self, newBurstCounter, burstArray, numElem, isIn, maxIdx
                            ):
      """ Insert newBurstCounter exactly once into the TopBurstCounter array.

          The first numElem entries of burstArray are walked in index order and
          the new counter is placed in front of the first entry whose value it
          reaches or exceeds, or after all of them when there is no such entry.
          The merged sequence is written back starting at index 0 and is cut
          once index maxIdx has been written.

          isIn is only used to name the array in the trace.
          Returns the number of entries written to burstArray. """
      debug( 'Inserting', newBurstCounter, 'in',
             'burstInCounter' if isIn else 'burstOutCounter', len( burstArray ) )
      merged = []
      inserted = False
      for i in range( numElem ):
         burstCounter = burstArray[ i ]
         # Without the 'not inserted' guard the new counter would be added again
         # in front of every following entry it is greater than or equal to.
         if not inserted and newBurstCounter.value >= burstCounter.value:
            merged.append( newBurstCounter )
            inserted = True
         merged.append( burstCounter )

      if not inserted:
         merged.append( newBurstCounter )

      idx = 0
      for burstCounter in merged:
         burstArray[ idx ] = burstCounter
         idx += 1
         if idx > maxIdx:
            # Index maxIdx was just written, the rest does not fit
            break
      return idx

   def _checkInsertBurstCounter( self, burstList, arrayMaxSize, burstInfo,
                                 lastUpdate, isIn ):
      """ Add the in ( isIn ) or out burst of burstInfo to burstList when it
          qualifies as a top burst, keeping burstList sorted by descending
          ( value, timestamp ).  When the list already held arrayMaxSize or more
          entries its last entry is removed after the insertion.

          Returns True if a BurstCounter was inserted, False otherwise. """
      curBytes = burstInfo.inBytes if isIn else burstInfo.outBytes
      listLen = len( burstList )

      # A burst qualifies when it is > 0 and either the list still has room or
      # the burst is greater or equal to the lowest element in the list
      if not curBytes > 0:
         return False
      reachesLowest = bool( burstList ) and curBytes >= burstList[ -1 ].value
      if not ( reachesLowest or listLen < arrayMaxSize ):
         return False

      burstCounter = BurstCounter( value=curBytes, jobKey=burstInfo.jobId,
                                   jobName=burstInfo.jobName,
                                   timestamp=lastUpdate )
      debug( 'Inserting new ', 'inBytes' if isIn else 'outBytes', 'burstCounter',
             burstCounter )
      burstList.append( burstCounter )
      burstList.sort( key=attrgetter( 'value', 'timestamp' ), reverse=True )
      if listLen + 1 > arrayMaxSize:
         # Trim the list to keep the insertion fast: the list now has
         # listLen + 1 entries, so the entry dropped is the last one
         debug( 'Removing from the burstList', burstList[ -1 ] )
         burstList.pop()
      return True

   def _updateJobHistory( self, jobId, job, tmpDataJobHistory, updateTime, status,
                          maxQueueSize, curBurst ):
      """ Add the job to status.jobHistory, dequeuing the oldest entries first
          while the history already holds maxQueueSize or more jobs.

          jobId, job: key and Sysdb status of the job being moved to the history.
          tmpDataJobHistory: indexed by jobId and then intfId; the entries carry
             the counters and taskTracker collected on this poll.
          updateTime: stored as the endTime of the history entry.
          status: cluster status holding jobHistory and localTaskTrackerInfo.
          curBurst: handed to _addCurBurst together with the final counters of
             every interface of the job. """
      trace( '_updateJobHistory entry for', jobId )
      jobHistory = JobHistory( jobId,
                               jobName=job.jobInfo.name,
                               user=job.jobInfo.user,
                               startTime=job.jobInfo.startTime,
                               endTime=updateTime )
      # First the interfaces Sysdb already knows about for this job
      # pylint: disable=too-many-nested-blocks
      for intfId, sysdbLocalJobCounter in job.localJobCounter.items():
         if jobId in tmpDataJobHistory and intfId in tmpDataJobHistory[ jobId ]:
            # Need to add last completion bytes
            counters = addBytes( jobId, intfId,
                                 tmpDataJobHistory[ jobId ][ intfId ].counters,
                                 sysdbLocalJobCounter.accumCompletedBytes )
         else:
            # Nothing new for this interface on this poll, so the completed bytes
            # in Sysdb are the final counters
            if ( sysdbLocalJobCounter.accumCompletedBytes !=
                 sysdbLocalJobCounter.accumBytes ):
               # We should have got completedBytes if the accum and accumCompleted
               # weren't the same.  Something is messed up in our logic, will be
               # good to debug...
               error( 'Completed bytes should be the same as accumBytes for',
                     jobId, 'completed:', sysdbLocalJobCounter.accumCompletedBytes,
                     'accum:', sysdbLocalJobCounter.accumBytes, 'intf', intfId )
               if traceHandle.enabled( 0 ):
                  # Dump the task reports Sysdb holds for this job.  Only the
                  # first taskTracker whose intfId matches is dumped: the outer
                  # break below leaves the taskTracker loop after it.
                  error( 'Previous taskReports' )
                  for taskTracker in status.localTaskTrackerInfo.values():
                     if taskTracker.intfId == intfId:
                        for runningJob in taskTracker.runningJob:
                           if runningJob == jobId:
                              for task in ( taskTracker.runningJob[
                                    runningJob ] ).values():
                                 error( task )
                              break
                        break

            counters = sysdbLocalJobCounter.accumCompletedBytes
            debug( 'Adding accumBytes for', intfId, jobId, counters )

         # The last burst of the job on this interface is the difference between
         # its final counters and the accumBytes in Sysdb
         self._addCurBurst( curBurst, intfId, jobId, job.jobInfo.name,
                            counters, sysdbLocalJobCounter.accumBytes,
                            sysdbLocalJobCounter.taskTracker )

         counterPerIntf = CountersPerIntfId( intfId=intfId, counters=counters )
         for taskTracker in sysdbLocalJobCounter.taskTracker:
            counterPerIntf.taskTracker[ taskTracker ] = True
         jobHistory.countersPerIntfId[ intfId ] = counterPerIntf

      # Then the interfaces that only show up in this poll's data
      if jobId in tmpDataJobHistory:
         for intfId in tmpDataJobHistory[ jobId ]:
            if intfId not in job.localJobCounter:
               # This is a new interface that is not present in Sysdb JobStatus.
               # Add counters to job history.
               counterPerIntf = tmpDataJobHistory[ jobId ][ intfId ]
               # Counters() stands in for the previous Sysdb counters
               # ( presumably all zero, so the whole amount counts as burst --
               # TODO confirm )
               self._addCurBurst( curBurst, intfId, jobId, job.jobInfo.name,
                                  counterPerIntf.counters, Counters(),
                                  counterPerIntf.taskTracker )
               jobHistory.countersPerIntfId[ intfId ] = counterPerIntf

      # NOTE(review): with maxQueueSize == 0 this condition is always true, even
      # for an empty history -- confirm the config cannot be 0.
      while len( status.jobHistory ) >= maxQueueSize:
         # Remove the oldest job from history
         status.jobHistory.deq()

      status.jobHistory.enq( jobHistory )
      if Tracing.enabled( 8 ):
         for historicalJob in status.jobHistory.values():
            sysdbTrace( 'job:', historicalJob )
            for intfCounters in historicalJob.countersPerIntfId.values():
               sysdbTrace( 'Job Counters:', intfCounters )
               for taskTracker in intfCounters.taskTracker:
                  sysdbTrace( 'Job TaskTrackers:', taskTracker )

   def _updateJobsAndCounters( self, item ):
      ''' Update the runningJob getting rid of old entries and updating
          existing or adding new entries.
          Update the jobHistory with local jobs that just ended.
          Update the topBurstCounter if this interval produced higher bursts
          counters on certain interfaces.

          item carries the cluster name ( item.clusterName ) and the data of one
          poll ( item.data ); the Sysdb status updated is
          self.status.clusterStatus[ item.clusterName ]. '''
      def _addAccumBytes( counters1, counters2, counters3=None ):
         # Return a new Counters holding the field by field sum of the
         # hdfsBytesRead, hdfsBytesWritten and reduceShuffleBytes of its
         # arguments.
         # NOTE(review): 'not counters3' relies on the truthiness of a Counters
         # instance, not on an 'is None' test -- confirm this is intended.
         if not counters3:
            counters3 = Counters()

         result = Counters()
         result.hdfsBytesRead = ( counters1.hdfsBytesRead + counters2.hdfsBytesRead +
                                  counters3.hdfsBytesRead )
         result.hdfsBytesWritten = ( counters1.hdfsBytesWritten +
                                     counters2.hdfsBytesWritten +
                                     counters3.hdfsBytesWritten )
         result.reduceShuffleBytes = ( counters1.reduceShuffleBytes +
                                       counters2.reduceShuffleBytes +
                                       counters3.reduceShuffleBytes )
         return result

      status = self.status.clusterStatus[ item.clusterName ]
      curBurst = None
      if not item.data.firstPoll:
         # CurBurst is a dict indexed by interfaces of the current jobs and its
         # burst (dict of a list of BurstCountersInfo) for this interval if this is
         # not the first polling interval.
         curBurst = {}
      # Local jobs that are in Sysdb but no longer in the polled data have ended:
      # move them to the job history before they are deleted below
      for jobId, job in status.runningJob.items():
         if job.isLocal and jobId not in item.data.clusterStatus.runningJob:
            self._updateJobHistory( jobId, job, item.data.jobHistory,
                  item.data.lastUpdate, status,
                  self.config.clusterConfig[ item.clusterName ].jobHistoryMaxQSize,
                  curBurst )
      delNotInStatusCollection( 'runningJob', status.runningJob,
                                item.data.clusterStatus.runningJob )

      # Create or refresh the Sysdb entry of every job reported by this poll
      for jobId, tmpJobStatus in item.data.clusterStatus.runningJob.items():
         if not jobId in status.runningJob:
            status.newRunningJob( jobId )
         jobStatus = status.runningJob[ jobId ]
         jobStatus.jobInfo = tmpJobStatus.jobInfo
         if tmpJobStatus.isLocal:
            # We just want to change this once if we find the job is local.  Just
            # one HTTP task found on a local taskTracker on the job is sufficient
            # to classify it as local.
            jobStatus.isLocal = True
         # If there were no completed events on the last interval the
         # tmpJobStatus coming from the agent will be 0.
         # Don't update sysdb in this case so we keep where we left.
         if tmpJobStatus.nextEventId:
            debug( 'Updating nextEventId with:', tmpJobStatus.nextEventId )
            jobStatus.nextEventId = tmpJobStatus.nextEventId

         sysdbTrace( 'Job', jobId, 'jobInfo', jobStatus.jobInfo, 'isLocal',
                jobStatus.isLocal, 'nextEventId', jobStatus.nextEventId )
         for intfId, tmpCounters in tmpJobStatus.localJobCounter.items():
            if intfId not in jobStatus.localJobCounter:
               jobStatus.newLocalJobCounter( intfId )
            statusCounters = jobStatus.localJobCounter[ intfId ]
            for taskTracker in tmpCounters.taskTracker:
               statusCounters.taskTracker[ taskTracker ] = True
            # New accumBytes = completed bytes already in Sysdb + bytes of this
            # poll ( tmp accumBytes + tmp accumCompletedBytes ).  It has to be
            # computed before accumCompletedBytes is overwritten just below.
            accumBytes = _addAccumBytes( statusCounters.accumCompletedBytes,
                                         tmpCounters.accumBytes,
                                         tmpCounters.accumCompletedBytes )
            statusCounters.accumCompletedBytes = _addAccumBytes(
                                              statusCounters.accumCompletedBytes,
                                              tmpCounters.accumCompletedBytes )
            # The burst is taken against the accumBytes still in Sysdb, so it has
            # to be recorded before statusCounters.accumBytes is replaced
            self._addCurBurst( curBurst, intfId, jobId, jobStatus.jobInfo.name,
                               accumBytes, statusCounters.accumBytes,
                               tmpCounters.taskTracker )
            statusCounters.accumBytes = accumBytes
            sysdbTrace( 'Modifying counters for', jobId, 'intfId', intfId,
                  'Tmp accumBytes:', tmpCounters.accumBytes,
                  'Status accumBytes:', statusCounters.accumBytes,
                  'Tmp accumCompletedBytes:', tmpCounters.accumCompletedBytes,
                  'Status accumCompletedBytes:', statusCounters.accumCompletedBytes )
      # curBurst stays None on the first poll, no bursts are processed then
      if curBurst is not None:
         self._processBurstCounters( status, curBurst, item.data.lastUpdate,
             self.config.clusterConfig[ item.clusterName ].topBurstCounterMaxSize )
      else:
         debug( 'Not updating current burst bytes, firstPoll' )

   def _updateLocalTaskTrackerInfo( self, item ):
      ''' Update localTaskTrackers info.  Get rid of taskTrackers that are no
          longer active.

          item carries the cluster name and the data of one poll.  TaskTrackers
          present in Sysdb but missing from the polled localTaskTrackerInfo are
          logged and deleted, taskTrackers present in both are refreshed and
          polled taskTrackers flagged as local are added. '''
      trace( 'Updating localTaskTrackers for', item.clusterName )
      status = self.status.clusterStatus[ item.clusterName ]
      polledTrackers = item.data.clusterStatus.localTaskTrackerInfo

      # Remove taskTrackers that are no longer announced by the JobTracker.
      # Walk a snapshot of the names: entries are deleted from the collection
      # inside the loop.
      for taskTrackerName in list( status.localTaskTrackerInfo ):
         if taskTrackerName not in polledTrackers:
            sysdbTrace( 'removing localTaskTrackerInfo', taskTrackerName )
            Logging.log( HadoopTracerLogMsgs.MAPREDUCEMONITOR_TASKTRACKER_REMOVED,
                         taskTrackerName, item.clusterName )
            del status.localTaskTrackerInfo[ taskTrackerName ]

      for name in polledTrackers:
         auxTracker = item.data.auxTaskTrackerInfo[ name ]
         if name in status.localTaskTrackerInfo:
            item.data.updateTaskTrackerTacFields( name,
                                                status.localTaskTrackerInfo[ name ] )
         elif auxTracker.isLocal:
            # This is a new local taskTracker reported by jobtracker
            item.data.newTaskTrackerInfo( status, name )

   def updateMapReduceData( self, item ):
      ''' Publish in Sysdb the result of one poll, as handed over by the thread
          communicating with the JobTracker.

          The update is dropped when the cluster has no status entry or its
          operState is not set.  When the poll reports an error only error and
          lastErrorUpdate are written; otherwise the error is cleared and
          clusterInfo, jobs, counters, local taskTrackers and lastUpdate are
          refreshed. '''
      trace( 'HadoopTracer.updateMapReduceData entry' )
      clusterName = item.clusterName
      allStatus = self.status.clusterStatus
      # We need all the tests on the self.status, operState, etc because it
      # is possible that we just processed an update from a config attrlog
      # that modified the status.
      known = clusterName in allStatus
      if known and allStatus[ clusterName ].operState:
         sysdbTrace( 'updating clusterInfo in sysdb', clusterName )
         status = allStatus[ clusterName ]
         polled = item.data.clusterStatus
         if polled.error:
            # Keep the stale values of the other fields on error
            status.error = polled.error
            status.lastErrorUpdate = item.data.lastUpdate
         else:
            status.error = ''
            status.clusterInfo = polled.clusterInfo
            self._updateJobsAndCounters( item )
            self._updateLocalTaskTrackerInfo( item )
            status.lastUpdate = item.data.lastUpdate
      else:
         sysdbTrace( 'dropping update for', clusterName, 'status:',
                     self.status )
         if known:
            sysdbTrace( 'operstate:', allStatus[ clusterName ].operState )
         else:
            sysdbTrace( 'no status for clusterName' )

      trace( 'HadoopTracer.updateMapReduceData exit' )

   def doInit( self, entityManager ):
      """ Called by the Agent base when it is safe to begin initialization.
          Mounts the hadooptracer config, status and cmdrequest paths, the smash
          bridging and arp tables and the mlag status, builds the local arp shim
          and closes the mount group with _mountDone as its callback. """
      trace( 'HadoopTracer.doInit entry sysname', entityManager.sysname() )
      sysdbMounts = entityManager.mountGroup()
      self.config = sysdbMounts.mount( 'hadooptracer/config',
                                       'HadoopTracer::Config', 'r' )
      self.status = sysdbMounts.mount( 'hadooptracer/status',
                                       'HadoopTracer::Status', 'w' )
      self.cmdRequest = sysdbMounts.mount( 'hadooptracer/cmdrequest',
                                           'HadoopTracer::CmdRequest', 'r' )

      smashEm = SharedMem.entityManager( sysdbEm=entityManager )
      self.smashBridgingStatus = smashEm.doMount( 'bridging/status',
                                                  'Smash::Bridging::Status',
                                                  Smash.mountInfo( 'reader' ) )

      # Create an arp smash shim, only for v4 entries and default vrf.
      self.arpSmash = smashEm.doMount( 'arp/status', 'Arp::Table::Status',
                                       Smash.mountInfo( 'keyshadow' ) )
      self.arpSmashVrfIdMap = smashEm.doMount( 'vrf/vrfIdMapStatus',
                                               'Vrf::VrfIdMap::Status',
                                               Smash.mountInfo( 'shadow' ) )
      # Local instance of arp status, populated by the arpShimSm
      self.arpStatus = Tac.newInstance( 'Arp::NeighborDir', 'arpStatus' )
      self.arpShimSm = Tac.newInstance( 'Arp::ArpShimSm', self.arpSmash,
                                        self.arpSmashVrfIdMap, DEFAULT_VRF_NAME,
                                        True, False, self.arpStatus, None )

      # Mount mlag/status, Mlag::Status and its dependent paths
      self.mlagStatus = MlagMountHelper.mountMlagStatus( sysdbMounts )

      sysdbMounts.close( self._mountDone )
      trace( 'HadoopTracer.doInit exit' )

   def _mountDone( self ):
      """ Callback of the mount group close issued by doInit: create the config
          reactors, mark the agent warm, then sync the status with the config
          and call handleEnabled. """
      trace( 'HadoopTracer._mountDone entry' )
      trace( 'instantiating server configuration reactors' )
      # Every per-cluster reactor gets the agent as its extra argument
      reactorArgs = ( self, )
      self.configReactor_ = ConfigReactor( self.config, self )
      self.clusterConfigReactor_ = Tac.collectionChangeReactor(
            self.config.clusterConfig, ClusterConfigReactor,
            reactorArgs=reactorArgs )
      self.clusterCmdRequestReactor_ = Tac.collectionChangeReactor(
            self.cmdRequest.clusterCmdRequest, ClusterCmdRequestReactor,
            reactorArgs=reactorArgs )
      self.warm_ = True
      self.syncStatus()
      self.handleEnabled()
      trace( 'HadoopTracer._mountDone exit' )

   def syncStatus( self ):
      """ Delete status of config that was removed while the agent was gone.
          Cleanup the status record of clusters that still have a clusterConfig.
      """
      trace( 'HadoopTracer.syncStatus entry' )

      # Iterate over a snapshot of the keys: entries are deleted from
      # clusterStatus inside the loop, so the collection itself must not be
      # the thing being iterated.
      for clusterName in list( self.status.clusterStatus ):
         if clusterName not in self.config.clusterConfig:
            debug( 'Cleaning up old status for', clusterName )
            del self.status.clusterStatus[ clusterName ]
         else:
            self.cleanupStatus( clusterName,
                                self.status.clusterStatus[ clusterName ] )

      trace( 'HadoopTracer.syncStatus exit' )

   def warm( self ):
      """ The warm method is called by the agent base to see if the agent is
          ready.  Returns warm_, which __init__ sets to False and _mountDone sets
          to True once the reactors have been created. """
      return self.warm_

   def maybeStartThread( self, clusterName ):
      """ Start the polling thread of clusterName if its configuration is
          complete and no thread is registered for it yet.

          The thread class is picked from the configured mapReduceVersion
          ( 'MR1' or 'MR2' ); any other version is reported and nothing is
          started.  Starting a thread creates the cluster status when missing,
          sets its operState and arms the thread cleanup clock.  The agent
          aborts if the cluster status still carries a lastUpdate. """
      trace( 'HadoopTracer.maybeStartThread entry for', clusterName )
      cfg = self.config.clusterConfig[ clusterName ]

      version = cfg.mapReduceVersion
      if version == 'MR1':
         threadClass = HadoopTracerMR1Thread
      elif version == 'MR2':
         threadClass = HadoopTracerMR2Thread
      else:
         error( 'Unknown MapReduce version in config', cfg.mapReduceVersion )
         return

      if not threadClass.isClusterConfigComplete( self.config, cfg ):
         trace( 'HadoopTracer.maybeStartThread not start exit for', clusterName )
         return

      if clusterName not in self.threads_:
         pollThread = threadClass( clusterName, self )
         self.threads_[ clusterName ] = pollThread
         if clusterName not in self.status.clusterStatus:
            self.status.newClusterStatus( clusterName )
         clusterStatus = self.status.clusterStatus[ clusterName ]
         # If we are starting a new thread we should have stopped the previous
         # one and should have cleanup the status
         if clusterStatus.lastUpdate:
            stamp = datetime.now().strftime( '%Y-%m-%d %H:%M:%S' )
            print( '{}: {}'.format( stamp,
                'Cluster not clean even though we are starting a new thread' ) )
            print( cfg )
            print( clusterStatus )
            os.abort()

         pollThread.start()
         clusterStatus.operState = True
         # Start monitoring this thread
         self.cleanupThreadClock.timeMin = Tac.now() + self.cleanupInterval

      trace( 'HadoopTracer.maybeStartThread exit for', clusterName )

   def cleanupThread( self ):
      """ Reap polling threads that are no longer alive.

          Live threads re-arm the cleanup clock.  A dead thread is dropped from
          self.threads_; its status is deleted when the cluster is no longer
          configured, otherwise operState is cleared, the status is reset and the
          thread is started again if the cluster is still enabled.  When the
          global config is disabled and no thread is left, status.running is set
          to False.
      """
      trace( 'HadoopTracer.cleanupThread entry' )
      for name in list( self.threads_ ):
         if self.threads_[ name ].is_alive():
            # Keep the clock running for as long as some thread is alive.
            debug( 'Thread alive:', name )
            self.cleanupThreadClock.timeMin = Tac.now() + self.cleanupInterval
            continue

         debug( 'removing updating thread for', name )
         del self.threads_[ name ]

         if name not in self.config.clusterConfig:
            debug( 'deleting status from clusterstatus', name )
            del self.status.clusterStatus[ name ]
            continue

         if name in self.status.clusterStatus:
            self.status.clusterStatus[ name ].operState = False
            self.cleanupStatus( name, self.status.clusterStatus[ name ] )
         if self.config.clusterConfig[ name ].enabled:
            # A start was probably requested before the old thread had
            # finished, so honour it now.
            debug( 'starting delayed', name )
            self.maybeStartThread( name )

      if not ( self.config.enabled or any( self.threads_ ) ):
         # Config disabled and every thread gone: flag ourselves as not
         # running so the launcher will stop us.
         trace( "Setting running status to False" )
         self.status.running = False
      trace( 'HadoopTracer.cleanupThread exit' )

   def stopThread( self, clusterName ):
      """ Request a stop of the polling thread registered for clusterName, if
          there is one, and set cleanupThreadClock.timeMin to the current time.
          The thread is not joined or removed from self.threads_ here. """
      trace( 'HadoopTracer.stopThread entry for', clusterName )
      hasThread = clusterName in self.threads_
      if hasThread:
         debug( 'Stopping thread for', clusterName )
         worker = self.threads_[ clusterName ]
         worker.stop()
         self.cleanupThreadClock.timeMin = Tac.now()
      trace( 'HadoopTracer.stopThread exit for', clusterName )

   def cleanupStatus( self, clusterName, clusterStatus ):
      """ Reset every field of the sysdb status record of clusterName except its
          jobHistory.

          NOTE: the clusterStatus argument is not used; the record is always
          looked up again in self.status.clusterStatus by clusterName. """
      trace( 'HadoopTracer.cleanupStatus entry for', clusterName )
      record = self.status.clusterStatus[ clusterName ]

      # Scalar fields go back to their empty values.
      record.lastUpdate = 0
      record.error = ''
      record.lastErrorUpdate = 0
      record.clusterInfo = ClusterInfo()

      # Collections are emptied; jobHistory is deliberately left alone.
      record.localTaskTrackerInfo.clear()
      record.attachedIntf.clear()
      record.runningJob.clear()
      record.topBurstCounter.clear()
      trace( 'HadoopTracer.cleanupStatus exit for', clusterName )

   def handleClusterConfig( self, clusterName ):
      """ React to a configuration change of clusterName.

          If a polling thread is registered for the cluster it is asked to stop.
          Otherwise, when the cluster is enabled, a start is attempted through
          maybeStartThread. """
      trace( 'HadoopTracer.handleClusterConfig entry for', clusterName )
      if clusterName in self.threads_:
         self.stopThread( clusterName )
      elif self.config.clusterConfig[ clusterName ].enabled:
         self.maybeStartThread( clusterName )

      trace( 'HadoopTracer.handleClusterConfig exit for', clusterName )

   def handleClearJobHistory( self, clusterName ):
      """ Empty the jobHistory collection of clusterName's status record; do
          nothing when the cluster has no status record. """
      trace( 'HadoopTracer.handleClearJobHistory entry for', clusterName )
      statuses = self.status.clusterStatus
      if clusterName in statuses:
         debug( 'Clearing job history for', clusterName )
         history = statuses[ clusterName ].jobHistory
         history.clear()
      trace( 'HadoopTracer.handleClearJobHistory exit for', clusterName )

   def handleClearTopBurstCounters( self, clusterName ):
      """ Empty the topBurstCounter collection of clusterName's status record;
          do nothing when the cluster has no status record. """
      trace( 'HadoopTracer.handleTopBurstCounter entry for', clusterName )
      statuses = self.status.clusterStatus
      if clusterName in statuses:
         debug( 'Clearing top burst counter for', clusterName )
         burstCounters = statuses[ clusterName ].topBurstCounter
         burstCounters.clear()
      trace( 'HadoopTracer.handleTopBurstCounter exit for', clusterName )

   def startPollingThreads( self ):
      """ Walk the clusterConfig entries and attempt to start a polling thread
          for each enabled cluster.  maybeStartThread decides whether the thread
          is actually started. """
      trace( 'HadoopTracer.startPollingThreads entry' )
      clusterConfigs = self.config.clusterConfig
      for name in clusterConfigs:
         if not clusterConfigs[ name ].enabled:
            continue
         self.maybeStartThread( name )

      trace( 'HadoopTracer.startPollingThreads exit' )

   def stopPollingThreads( self ):
      """ Request a stop of every registered polling thread """
      trace( 'HadoopTracer.stopPollingThreads entry' )
      # Only the stop request is sent here; waiting for the threads could
      # cause heartbeat issues.  Entries are removed from threads_ later by
      # the periodically invoked cleanupThread routine.
      registered = list( self.threads_ )
      for name in registered:
         self.stopThread( name )
      trace( 'HadoopTracer.stopPollingThreads exit' )

   def handleEnabled( self ):
      """ Apply the global enabled flag: when set, start the polling threads and
          mark the status as running; when cleared, request all threads to stop
          and set cleanupThreadClock.timeMin to the current time. """
      trace( 'HadoopTracer.enabled entry' )
      if not self.config.enabled:
         self.stopPollingThreads()
         self.cleanupThreadClock.timeMin = Tac.now()
      else:
         self.startPollingThreads()
         self.status.running = True
      trace( 'HadoopTracer.enabled exit' )

class ConfigReactor( Tac.Notifiee ):
   """ Notifiee on the HadoopTracer::Config entity that forwards changes of
       'enabled' and 'clusterConfig' to the agent. """
   notifierTypeName = 'HadoopTracer::Config'

   def __init__( self, config, agent ):
      """ Remember the owning agent, then attach to the config entity. """
      trace( 'ConfigReactor __init__ entry' )
      # agent is assigned before the base initialiser runs, as in the
      # handlers it is the only state they rely on.
      self.agent = agent
      super().__init__( config )
      trace( 'ConfigReactor __init__ exit' )

   @Tac.handler( 'enabled' )
   def handleEnabled( self ):
      """ Forward a change of the global enabled flag to the agent. """
      trace( 'ConfigReactor.handleEnabled entry' )
      threadName = threading.current_thread().name
      debug( 'thread handleEnabled:', threadName )
      self.agent.handleEnabled()
      trace( 'ConfigReactor.handleEnabled exit' )

   @Tac.handler( 'clusterConfig' )
   def handleClusterConfig( self, name=None ):
      """ Handle a change of the clusterConfig collection for key name.  When
          the key is no longer present, the cluster's thread is asked to stop
          and its status entry is deleted. """
      trace( 'ConfigReactor.handleClusterConfig entry for', name )
      agent = self.agent
      if name not in agent.config.clusterConfig:
         debug( 'Deleted clusterConfig:', name )
         agent.stopThread( name )
         del agent.status.clusterStatus[ name ]
      else:
         debug( 'New clusterConfig:', name )
      trace( 'ConfigReactor.handleClusterConfig exit for', name )

class ClusterConfigReactor( Tac.Notifiee ):
   """ Notifiee on one HadoopTracer::ClusterConfig entity.  Every handled
       attribute change is forwarded to the agent's handleClusterConfig for
       this cluster. """
   notifierTypeName = 'HadoopTracer::ClusterConfig'

   def __init__( self, entity, agent ):
      """ Attach to entity and make sure a status entry exists for it. """
      trace( 'ClusterConfigReactor __init__ for', entity.name )
      self.agent = agent
      self.clusterName_ = entity.name
      super().__init__( entity )
      if self.clusterName_ not in self.agent.status.clusterStatus:
         debug( 'adding new ClusterName to status', self.clusterName_ )
         self.agent.status.newClusterStatus( self.clusterName_ )
      else:
         debug( 'ClusterName already in status', self.clusterName_ )
      trace( 'ClusterConfigReactor __init__ exit for', entity.name )

   def _applyConfigChange( self ):
      """ Hand this cluster's name to the agent so it re-evaluates its thread. """
      self.agent.handleClusterConfig( self.clusterName_ )

   @Tac.handler( 'host' )
   def handleHost( self ):
      """ 'host' changed. """
      trace( 'ClusterConfigReactor.handleHost entry' )
      self._applyConfigChange()
      trace( 'ClusterConfigReactor.handleHost exit' )

   @Tac.handler( 'rpcPort' )
   def handleRpcPort( self ):
      """ 'rpcPort' changed. """
      trace( 'ClusterConfigReactor.handleRpcPort entry' )
      self._applyConfigChange()
      trace( 'ClusterConfigReactor.handleRpcPort exit' )

   @Tac.handler( 'httpPort' )
   def handleHttpPort( self ):
      """ 'httpPort' changed. """
      trace( 'ClusterConfigReactor.handleHttpPort entry' )
      self._applyConfigChange()
      trace( 'ClusterConfigReactor.handleHttpPort exit' )

   @Tac.handler( 'user' )
   def handleUser( self ):
      """ 'user' changed. """
      trace( 'ClusterConfigReactor.handleUser entry' )
      self._applyConfigChange()
      trace( 'ClusterConfigReactor.handleUser exit' )

   @Tac.handler( 'resourceManagerHost' )
   def handleResourceManagerHost( self ):
      """ 'resourceManagerHost' changed. """
      trace( 'ClusterConfigReactor.handleResourceManagerHost entry' )
      self._applyConfigChange()
      trace( 'ClusterConfigReactor.handleResourceManagerHost exit' )

   @Tac.handler( 'resourceManagerPort' )
   def handleResourceManagerPort( self ):
      """ 'resourceManagerPort' changed. """
      trace( 'ClusterConfigReactor.handleResourceManagerPort entry' )
      self._applyConfigChange()
      trace( 'ClusterConfigReactor.handleResourceManagerPort exit' )

   @Tac.handler( 'jobHistoryHost' )
   def handleJobHistoryHost( self ):
      """ 'jobHistoryHost' changed. """
      trace( 'ClusterConfigReactor.handleJobHistoryHost entry' )
      self._applyConfigChange()
      trace( 'ClusterConfigReactor.handleJobHistoryHost exit' )

   @Tac.handler( 'jobHistoryPort' )
   def handleJobHistoryPort( self ):
      """ 'jobHistoryPort' changed. """
      trace( 'ClusterConfigReactor.handleJobHistoryPort entry' )
      self._applyConfigChange()
      trace( 'ClusterConfigReactor.handleJobHistoryPort exit' )

   @Tac.handler( 'interval' )
   def handleInterval( self ):
      """ 'interval' changed. """
      trace( 'ClusterConfigReactor.handleInterval entry' )
      self._applyConfigChange()
      trace( 'ClusterConfigReactor.handleInterval exit' )

   @Tac.handler( 'enabled' )
   def handleEnabled( self ):
      """ 'enabled' changed. """
      trace( 'ClusterConfigReactor.handleEnabled entry' )
      self._applyConfigChange()
      trace( 'ClusterConfigReactor.handleEnabled exit' )

class ClusterCmdRequestReactor( Tac.Notifiee ):
   """ Notifiee on one HadoopTracer::ClusterCmdRequest entity; relays the
       clear requests for this cluster to the agent. """
   notifierTypeName = 'HadoopTracer::ClusterCmdRequest'

   def __init__( self, entity, agent ):
      """ Record the agent and the cluster name, then attach to entity. """
      trace( 'ClusterCmdRequestReactor __init__ for', entity.name )
      self.agent = agent
      self.clusterName_ = entity.name
      super().__init__( entity )
      trace( 'ClusterCmdRequestReactor __init__ exit for', entity.name )

   @Tac.handler( 'clearJobHistory' )
   def handleClearJobHistory( self ):
      """ 'clearJobHistory' changed: have the agent clear the job history. """
      cluster = self.clusterName_
      self.agent.handleClearJobHistory( cluster )

   @Tac.handler( 'clearBurstCounters' )
   def handleClearTopBurstCounter( self ):
      """ 'clearBurstCounters' changed: have the agent clear the top burst
          counters. """
      cluster = self.clusterName_
      self.agent.handleClearTopBurstCounters( cluster )

def main():
   """ Build an agent container holding the HadoopTracer agent and run it. """
   trace( 'Creating agent HadoopTracer container' )
   agentClasses = [ HadoopTracer ]
   agentContainer = Agent.AgentContainer( agentClasses )
   trace( 'Running agent' )
   agentContainer.runAgents()
