# Copyright (c) 2016 Arista Networks, Inc.  All rights reserved.
# Arista Networks, Inc. Confidential and Proprietary.

# pylint: disable=consider-using-f-string

import builtins
import ctypes
from datetime import datetime
import errno
import fcntl
import os
import select
import signal
import socket
import tempfile
import threading
import time
import traceback
import sys
import gc

import Agent
import BasicCli
import BasicCliSession
import Syscall
import Cli
import CliApi
import CliArgParser
import CliCommon
import CliPatchOs
import CliPatchPdb
import CliPatchSigint
import ConfigAgentName
import CliShellLib
import CliSubprocMgr
import CliThreadCtrl # pylint: disable=import-error
import Excepthook
import FastServUtil
import JsonCli
import Logging
import PyServer
import QuickTrace
import SecMonUtil
import Tac
import ThreadLocalEnv
import ThreadLocalFile
import Tracing
import _Tac
import ArSlabPython
from CliPlugin.CliCliModel import AgentMemoryUsage

# pylint: disable=c-extension-no-member

# Tracing handle for this module; the aliases below expose its trace levels 0-4.
traceHandle = Tracing.Handle( 'CliServerLib' )
log = traceHandle.trace0
warn = traceHandle.trace1
info = traceHandle.trace2
trace = traceHandle.trace3
debug = traceHandle.trace4

qt0 = QuickTrace.trace0
qv = QuickTrace.Var

# Seconds ActivityThreadMonitorThread waits for the activity lock before it
# reports contention.
ACTVITY_THREAD_MONITOR_TIMEOUT = 5
# listen() backlog used for the unix domain server sockets created in this file.
CONNECTION_BACKLOG = 128
SELECT_TIMEOUT = 5

# use_errno=True makes ctypes keep a private copy of errno for calls made
# through this handle; without it ctypes.get_errno() (used after
# libc.unshare() fails) never reflects the failing call.
libc = ctypes.CDLL( 'libc.so.6', use_errno=True )

SYS_LOCK_CONTENTION = Logging.LogHandle( "SYS_LOCK_CONTENTION",
   severity=Logging.logNotice,
   fmt="Unable to acquire Activity lock in %ss while running command '%s'.",
   explanation=( "Agent was slow to acquire activity lock." ),
   recommendedAction=Logging.NO_ACTION_REQUIRED )

# -----------------------------------------------------------------------------------
# Some functions used for testing and changing behavior of ConfigAgent to simulate
# errors

# This flag is used in certain tests to emulate non-responding ConfigAgent:
# while it is True, CliServer._acceptConnection logs, sleeps and returns without
# accepting the pending connection.
connectionsDisabled_ = False
# The real print function, saved at import time so unsetPrintRaisesError() can
# put it back after setPrintRaisesError() has replaced builtins.print.
builtinPrint_ = builtins.print

def mockPrint( *args, **kwargs ):
   '''Replacement for builtins.print that always fails.

   Any positional and keyword arguments are accepted and ignored; every call
   raises OSError( 'mock' ).
   '''
   raise OSError( 'mock' )

def setPrintRaisesError():
   '''Make every print() call raise OSError, simulating a full file system.

   Installs mockPrint as builtins.print for the whole process.
   '''
   builtins.print = mockPrint

def unsetPrintRaisesError():
   '''Put back the print function that was saved in builtinPrint_.'''
   builtins.print = builtinPrint_
# -----------------------------------------------------------------------------------

class CliShellContext:
   '''Everything a CliShell frontend hands over on one server connection.

   acceptConnection() reads, in order, the shell arguments, the environment,
   the uid/gid, the controlling tty name, a one-character tty type and then
   the file descriptors / sockets that go with that tty type. The accessors
   return what was received (None for anything not applicable to the tty
   type) and cleanup() closes every file and socket still held.
   '''
   def __init__( self ):
      self.stdin_ = None
      self.stdout_ = None
      self.stderr_ = None
      self.ttyType_ = None
      self.cliShellArgs_ = None
      self.options_ = None
      self.env_ = None
      self.aaaUser_ = None
      self.ctty_ = None
      self.cliInputSock_ = None
      self.cliInput_ = None
      self.requestSock_ = None
      self.jsonRpcStatisticsSock_ = None

   def stdin( self ):
      return self.stdin_

   def stdout( self ):
      return self.stdout_

   def stderr( self ):
      return self.stderr_

   def ttyType( self ):
      return self.ttyType_

   def cliShellArgs( self ):
      return self.cliShellArgs_

   def options( self ):
      return self.options_

   def env( self ):
      return self.env_

   def aaaUser( self ):
      return self.aaaUser_

   def ctty( self ):
      return self.ctty_

   def cliInput( self ):
      return self.cliInput_

   def requestSock( self ):
      return self.requestSock_

   def jsonRpcStatisticsSock( self ):
      return self.jsonRpcStatisticsSock_

   def _getTtyType( self, conn ):
      '''Read the one-byte tty type from conn and return it as a str.

      Asserts that the value is one of 't', 'u', 's', 'c' or 'd'.
      '''
      ttyType = conn.recv( 1 )
      ttyType = ttyType.decode()
      assert ttyType in ( 't', 'u', 's', 'c', 'd' ), "Unknown TTY type %s" % ttyType
      return ttyType

   def _recvFds( self, sock, numFds ):
      '''Receive up to numFds file descriptors from sock; return them as a list.'''
      return socket.recv_fds( sock, 1, numFds )[ 1 ]

   def _getStdinOutErr( self, conn ):
      '''Receive the fds for the current tty type and wrap them in file objects.

      Returns a ( stdin, stdout, stderr ) tuple; entries that the tty type does
      not provide are None.
      '''
      if self.ttyType_ == 't': # t is for TTY
         secondaryPtyFd = self._recvFds( conn, 1 )[ 0 ]
         try:
            # each stream gets its own dup so they can be closed independently
            stdin = os.fdopen( os.dup( secondaryPtyFd ), 'r' )
            stdout = os.fdopen( os.dup( secondaryPtyFd ), 'w' )
            stderr = os.fdopen( os.dup( secondaryPtyFd ), 'w' )
         finally:
            # the received fd is only a template for the dups; never leak it,
            # even when one of the dup/fdopen calls fails
            os.close( secondaryPtyFd )
         return stdin, stdout, stderr
      elif self.ttyType_ == 'u': # u is for 3 non-tty files
         fds = self._recvFds( conn, 3 )
         stdin = os.fdopen( fds[ 0 ], 'r' )
         stdout = os.fdopen( fds[ 1 ], 'w' )
         stderr = os.fdopen( fds[ 2 ], 'w' )
         return stdin, stdout, stderr
      elif self.ttyType_ == 's': # s is for simple CLI
         fds = self._recvFds( conn, 2 )
         stdout = os.fdopen( fds[ 0 ], 'w' )
         stderr = os.fdopen( fds[ 1 ], 'w' )
         return None, stdout, stderr
      # c is for cookie, sorry I mean CAPI
      # pylint: disable-next=consider-using-in
      elif self.ttyType_ == 'c' or self.ttyType_ == 'd':
         remoteFd = self._recvFds( conn, 1 )[ 0 ]
         stdout = os.fdopen( remoteFd, 'w' )
         return None, stdout, None

      assert False
      return None, None, None

   def _getEnv( self, conn ):
      '''Read the frontend environment from conn and return it as a dict.

      The wire format is one string of NUL-separated fields alternating
      key, value, key, value, ...; a trailing field without a partner is
      ignored.
      '''
      envs = FastServUtil.readString( conn ).split( '\x00' )
      # even positions are keys, odd positions are the matching values
      return dict( zip( envs[ ::2 ], envs[ 1::2 ] ) )

   def _getOptions( self, conn ):
      '''Read the NUL-separated CliShell arguments from conn and parse them.

      The raw argument list is also kept in cliShellArgs_.
      '''
      cliShellArgsRaw = FastServUtil.readString( conn )
      cliShellArgs = cliShellArgsRaw.split( '\x00' ) if cliShellArgsRaw else []
      self.cliShellArgs_ = cliShellArgs
      return CliArgParser.parseArgs( args=cliShellArgs, standaloneGuards=False )

   def _getAaaUser( self, conn, env ):
      '''Read uid and gid from conn and combine them with the AAA_AUTHN_ID /
      AAA_AUTHN_USER entries of env into a BasicCliSession.AaaUser.'''
      uid = int( FastServUtil.readString( conn ) )
      gid = int( FastServUtil.readString( conn ) )

      aaaAuthnId = env.get( 'AAA_AUTHN_ID', None )
      if aaaAuthnId:
         aaaAuthnId = int( aaaAuthnId )
      aaaAuthnUser = env.get( 'AAA_AUTHN_USER', None )
      return BasicCliSession.AaaUser( aaaAuthnUser, aaaAuthnId, uid, gid )

   def acceptConnection( self, conn ):
      '''Read the complete handshake from conn and populate this context.

      The read order below is the wire protocol; it must not be changed.
      '''
      self.options_ = self._getOptions( conn )
      self.env_ = self._getEnv( conn )
      self.aaaUser_ = self._getAaaUser( conn, self.env_ )
      trace( "aaaUser:", self.aaaUser_ )
      self.ctty_ = FastServUtil.readString( conn )
      trace( "ctty:", self.ctty_ )
      self.ttyType_ = self._getTtyType( conn )
      trace( "tty type:", self.ttyType_ )
      self.stdin_, self.stdout_, self.stderr_ = self._getStdinOutErr( conn )
      trace( "received stdin/out/err fds" )
      # Default outgoing encoding on those sockets seems to be ANSI_X3.4-1968, which
      # causes "internal error" when printing utf-8: "UnicodeEncodeError: 'ascii'
      # codec can't encode character... ordinal not in range(128)", see BUG697781
      # Ask the object we are about to reconfigure (not sys.stdout, which is a
      # different stream) whether it supports reconfigure.
      if hasattr( self.stdout_, 'reconfigure' ):
         self.stdout_.reconfigure( encoding='utf-8' )

      if self.ttyType_ == 't':
         self.requestSock_ = FastServUtil.recvSock( conn )
         self.cliInputSock_ = FastServUtil.recvSock( conn )
         assert self.requestSock_ and self.cliInputSock_
      if self.ttyType_ == 's':
         self.requestSock_ = FastServUtil.recvSock( conn )
         assert self.requestSock_
      # pylint: disable-next=consider-using-in
      if self.ttyType_ == 'c' or self.ttyType_ == 'd':
         self.requestSock_ = FastServUtil.recvSock( conn )
         self.jsonRpcStatisticsSock_ = FastServUtil.recvSock( conn )
         assert self.requestSock_ and self.jsonRpcStatisticsSock_
      trace( "received socket" )
      # remote input is only wanted for a session that reads commands
      # interactively, i.e. not when input comes from a file, a single
      # command or a completion request
      if self.cliInputSock_ and not ( self.options_.input_file or
                                      self.options_.command or
                                      self.options_.completions ):
         self.cliInput_ = CliShellLib.RemoteCliInput( self.cliInputSock_ )
      trace( "connection accepted" )

   def cleanup( self ):
      '''Close every socket and file still held by this context.

      This isn't strictly needed. The garbage collector will eventually
      come and clean these guys up. However this might take a little while
      so we close our FDs manually.

      Every resource is attempted even if closing an earlier one fails (e.g.
      flushing stdout to a frontend that already went away); the first error
      is re-raised once all of them have been handled.
      '''
      firstError = None
      for attr in ( 'cliInputSock_', 'requestSock_', 'jsonRpcStatisticsSock_',
                    'stdin_', 'stdout_', 'stderr_' ):
         resource = getattr( self, attr )
         if not resource:
            continue
         setattr( self, attr, None )
         try:
            resource.close()
         except Exception as e: # pylint: disable-msg=broad-except
            if firstError is None:
               firstError = e
      if firstError is not None:
         raise firstError

def printCliThreads():
   '''Print a diagnostic snapshot of the CLI threads to stdout.

   The dump contains the activity lock owner, the entries of
   BasicCliSession.CMD_HISTORY (newest start time first, empty slots
   skipped), the Python backtraces and the output of arstack for this
   process. It is best effort: an OSError raised anywhere along the way ends
   the dump silently.
   '''
   def _startTimeOrOldest( entry ):
      # empty history slots sort after every real entry
      return entry.startTime if entry else -1

   try:
      print()
      print( 'Activity Lock owner is:', _Tac.activityLockOwner() )
      print( 'Recently run commands:' )
      newestFirst = sorted( BasicCliSession.CMD_HISTORY,
                            key=_startTimeOrOldest, reverse=True )
      for entry in newestFirst:
         if entry is not None:
            print( entry )

      print( "Python backtraces:" )
      CliPatchSigint.printThreads()

      print( "C++ backtraces:" )
      # pkgdeps: rpmwith %{_bindir}/arstack
      arstackCmd = [ 'timeout', '-s', 'ABRT', '60', 'arstack',
                     str( os.getpid() ) ]
      arstackOutput = Tac.run( arstackCmd, stdout=Tac.CAPTURE, asRoot=True,
                               ignoreReturnCode=True )
      for stackLine in arstackOutput.splitlines():
         print( '  ', stackLine )
   except OSError:
      pass

class ActivityThreadMonitorThread( threading.Thread ):
   '''Watchdog thread that reports when the activity lock cannot be acquired.

   Every iteration tries to take the activity lock with a timeout of
   ACTVITY_THREAD_MONITOR_TIMEOUT seconds. On success the lock is released
   again right away; on timeout the suspected command of the lock owner and a
   dump of all CLI threads are printed and, if the cli has the activity lock
   monitor enabled, SYS_LOCK_CONTENTION is logged.
   '''
   def __init__( self, signalSockMap, cli ):
      super().__init__()
      self.signalSockMap_ = signalSockMap
      self.cli_ = cli

   def run( self ):
      '''Monitor loop; never returns.'''
      while True:
         try:
            activityLockGrabbed = _Tac.timedAcquireActivityLock(
                                    ACTVITY_THREAD_MONITOR_TIMEOUT )
            if activityLockGrabbed:
               _Tac.releaseActivityLock()
            else:
               lockOwner = _Tac.activityLockOwner()
               threadCmdList = BasicCliSession.THREAD_ID_CMD_MAPPING.get( lockOwner )
               # the most recent command of the lock owner is the suspect
               suspectedCmd = threadCmdList[ -1 ] if threadCmdList else None
               print()
               print( "Waited", ACTVITY_THREAD_MONITOR_TIMEOUT,
                      "s to grab activitylock but was unable to, suspected cmd:",
                      suspectedCmd )
               print()
               print( "Additional state of the system:" )
               printCliThreads()
               ts = time.monotonic() # the below needs the lock: measure extra wait
               if self.cli_.activityLockMonitorEnabled():
                  cmd = suspectedCmd.cmd if suspectedCmd else 'Unknown'
                  SYS_LOCK_CONTENTION( ACTVITY_THREAD_MONITOR_TIMEOUT, cmd )
               extraWait = time.monotonic() - ts
               if extraWait > ACTVITY_THREAD_MONITOR_TIMEOUT:
                  print( f"Extra wait to grab lock: {extraWait:.2f}s" )

            time.sleep( 2.5 )
         except Exception as e: # pylint: disable-msg=broad-except
            try:
               print( "ActivityThreadMonitorThread Exception", e )
            except OSError:
               # Unable to log (e.g. the file system is full, which may be what
               # raised in the first place). An exception escaping from here
               # would end the monitor thread for good, so keep going.
               pass
            time.sleep( 2.5 )

class CliCtrlThread( threading.Thread ):
   '''Control thread: every connection to the CLI control socket is answered
   with a dump of the CLI threads (see printCliThreads), which is also printed
   to this process' own stdout.'''
   def __init__( self, sysname, signalSockMap ):
      super().__init__()
      self.sysname_ = sysname
      self.signalSockMap_ = signalSockMap

   def _getThreadInfoDumps( self ):
      '''Run printCliThreads() with this thread's stdout/stderr redirected to
      a temporary file and return everything that was written as a str.'''
      # the context manager closes (and thereby deletes) the temporary file
      with tempfile.TemporaryFile( mode='w+t' ) as outputFile:
         sys.stdout.setFile( outputFile )
         sys.stderr.setFile( outputFile )
         try:
            printCliThreads()
         finally:
            # always undo the redirection, even if the dump blew up
            sys.stdout.unsetFile()
            sys.stderr.unsetFile()
         outputFile.seek( 0 )
         return outputFile.read()

   def run( self ):
      '''Serve the control socket; never returns.'''
      CliPatchSigint.createState( "CliCtrl" )
      serverSock = socket.socket( socket.AF_UNIX, socket.SOCK_STREAM, 0 )
      fcntl.fcntl( serverSock, fcntl.F_SETFD,
                   fcntl.fcntl( serverSock, fcntl.F_GETFD ) | fcntl.FD_CLOEXEC )
      serverSock.bind( CliCommon.CLI_CTRL_ADDRESS_FMT % self.sysname_ )
      serverSock.setsockopt( socket.SOL_SOCKET, socket.SO_REUSEADDR, 1 )
      serverSock.listen( CONNECTION_BACKLOG )
      serverSock.setblocking( 0 )

      while True:
         _, _, _ = select.select( [ serverSock ], [], [] )
         try:
            conn, _ = serverSock.accept()
         except BlockingIOError:
            # the listening socket is non-blocking: the peer went away between
            # select() and accept(), so there is nothing to serve
            continue
         try:
            threadInfoDump = self._getThreadInfoDumps()
            try:
               print( threadInfoDump ) # print to the log file
            except OSError:
               pass
            try:
               conn.sendall( threadInfoDump.encode() ) # pylint: disable-msg=E1101
            except OSError:
               # the requester hung up before reading the whole dump; that must
               # not take the control thread down
               pass
         finally:
            conn.close()

class CliThread( threading.Thread ):
   '''Thread serving exactly one CliShell frontend connection.

   The thread reads the handshake from the connection into a CliShellContext,
   switches to the uid/gid and environment of the connecting user and then
   runs the session flavor selected by the tty type (CAPI, simple CLI or
   interactive CLI). When the session ends it kills the process groups of its
   children, closes the context and writes the return code to the signal
   socket.
   '''
   def __init__( self, entityManager, cli, signalSock, conn, threadStartEvent,
                 subprocMgr ):
      super().__init__()
      self.entityManager_ = entityManager
      self.cli_ = cli
      self.conn_ = conn
      self.signalSock_ = signalSock
      self.tid_ = None
      self.childPids_ = [] # written by atfork handler
      self.threadStartEvent_ = threadStartEvent
      self.subprocMgr_ = subprocMgr
      # Sanity check that ConfigAgent is not running in
      # 'multiThreadedWithThreadLocals' mode. It should not as each
      # short lived CLI thread will then allocate from new empty pages
      # that likely will never be filled, as once the CLI thread
      # terminates, they are handed off to the 'main thread' that
      # likely will end up accumulating lots of partially filled
      # pages.
      threadMode = ArSlabPython.slabAllocatorThreadMode()
      assert threadMode != 'multiThreadedWithThreadLocals'

   def _unshare( self ):
      '''Give this thread its own copy of the file system information.

      Raises OSError if the unshare() call reports a failure.
      '''
      # currently we only unshare the FS. Description from man page:
      # The thread works on a copy of the file system information of the
      # calling process at the time of the clone() call.
      CLONE_FS = 512
      result = libc.unshare( CLONE_FS )
      if result != 0:
         errnoVal = ctypes.get_errno()
         raise OSError( "unshare() called failed (errno={} ({}))".format(
                        errnoVal, errno.errorcode.get( errnoVal, "?" ) ) )

   @staticmethod
   def _pathWithSbinDirs( currPath ):
      '''Return currPath (a ':'-separated PATH value) with the sbin
      directories that are not already listed put in front of it.'''
      extraPathDirs = ( '/usr/local/sbin', '/usr/sbin', '/sbin' )
      # ''.split( ':' ) is [ '' ], which would leave a trailing ':' in the
      # result; an empty PATH element stands for the current directory, which
      # must not silently end up on the search path.
      currPathDirs = currPath.split( ':' ) if currPath else []
      for d in extraPathDirs:
         if d not in currPathDirs:
            currPathDirs.insert( 0, d )
      return ':'.join( currPathDirs )

   def _setEnv( self, cliShellCtx ):
      '''Make this thread run as the connecting user: register the tty with
      CliThreadCtrl, switch gid/uid, import the frontend environment, change
      to its PWD and fix up PATH and NOPDB.'''
      # Update CliThreadInfo
      CliThreadCtrl.setEnv( self.tid_, cliShellCtx.ctty() )
      # Set uid/gid of the current thread
      Syscall.setregid( cliShellCtx.aaaUser().gid,
                        cliShellCtx.aaaUser().gid )
      # keep saved uid as 0, so we can get back to root if we want.
      Syscall.setresuid( cliShellCtx.aaaUser().uid,
                         cliShellCtx.aaaUser().uid,
                         # keep saved uid as 0
                         -1 )

      for key, value in cliShellCtx.env().items():
         os.environ[ key ] = value
      if 'PWD' in os.environ:
         try:
            # we can do this because we have CLONE_FS in _unshare
            os.chdir( os.environ[ 'PWD' ] )
         except OSError:
            pass

      # Add the sbin dirs to PATH.  This is necessary because Cli is the user's
      # shell and /bin/login only puts the sbin dirs in root's path.  We want to
      # avoid every single CliPlugin that runs external commands needing to
      # manipulate the path or specify an absolute path.
      os.environ[ 'PATH' ] = self._pathWithSbinDirs(
                                    os.environ.get( 'PATH', "" ) )

      # Set the 'NOPDB' environment variable before importing Tac to prevent us from
      # ever dropping into PDB (see tacc/Tac/Excepthook.py), as doing so could
      # permit a security violation.
      os.environ[ 'NOPDB' ] = '1'

   def killChildThreads( self, signum=signal.SIGKILL ):
      '''Send signum to the process group of every recorded child and forget
      the children afterwards. For SIGINT, children that the subprocess
      manager marks as skipChildSigInt are not signalled.'''
      # we will iterate through all of our children and kill
      # their process groups
      for childPid in self.childPids_:
         if ( signum == signal.SIGINT and
              self.subprocMgr_.skipChildSigInt( childPid ) ):
            continue
         try:
            trace( "kill child", childPid, "signal", signum )
            os.killpg( childPid, signum )
         except OSError:
            pass
      self.subprocMgr_.removePids( self.childPids_ )
      self.childPids_ = []

   def _runSimpleCliConnection( self, cliShellCtx ):
      '''Run a non-interactive session that executes the commands arriving on
      the request socket (tty type 's').'''
      sys.stdout.setFile( cliShellCtx.stdout() )
      sys.stderr.setFile( cliShellCtx.stderr() )
      options = cliShellCtx.options()
      session = BasicCliSession.Session( BasicCli.UnprivMode,
                                         self.entityManager_,
                                         privLevel=options.privilege,
                                         disableAaa=options.disable_aaa,
                                         disableGuards=options.disable_guards,
                                    autoComplete=not options.disable_autocomplete,
                                         startupConfig=False,
                                         secureMonitor=False,
                                         standalone=False,
                                         interactive=False,
                                         disableAutoMore=True,
                                         cli=self.cli_,
                                         aaaUser=cliShellCtx.aaaUser() )
      try:
         Cli.runFrontendCmds( session, cliShellCtx.requestSock() )
      finally:
         session.close()

   def _runCapiConnection( self, cliShellCtx, stateless ):
      '''Serve JSON-RPC requests arriving on the request socket (tty types
      'c' and 'd').

      With stateless set exactly one request is processed; otherwise requests
      are processed until an empty one is read. After every request the
      request and command counts are written to the statistics socket.
      '''
      # We replace our std(in|out|err) with a dup of the 0,1,2 for CAPI.
      # The reason for this is not intuitive as normally if we don't have
      # a file set we will be setup as the original file. However
      # during the execution of a capi request we want to capture all of the
      # output of a command, so we will create a tmp file and dup its fileno
      # to stdout.fileno(). If we don't replace our file with a dup of the original
      # then we will be duping over 0,1,2 which will capture output for the entire
      # process.
      sys.stdin.setFile( os.fdopen( os.dup( 0 ), 'r' ) )
      sys.stdout.setFile( os.fdopen( os.dup( 1 ), 'w' ) )
      sys.stderr.setFile( os.fdopen( os.dup( 2 ), 'w' ) )
      # we create a jsonCli to be able to process our request.
      options = cliShellCtx.options()
      session = BasicCliSession.Session( BasicCli.UnprivMode,
                                  self.entityManager_,
                                  privLevel=options.privilege,
                                  disableAaa=options.disable_aaa,
                                  disableGuards=options.disable_guards,
                                  disableAutoMore=True,
                                  isEapiClient=True,
                                  standalone=self.entityManager_.isLocalEm(),
                                  cli=self.cli_,
                                  aaaUser=cliShellCtx.aaaUser() )
      capiExecutor = CliApi.CapiExecutor( self.cli_, session, stateless=stateless )
      jsonCli = JsonCli.JsonCli( capiExecutor )
      statisticsSock = cliShellCtx.jsonRpcStatisticsSock()
      stdoutFileno = cliShellCtx.stdout().fileno()

      try:
         while True:
            jsonRpcRequest = FastServUtil.readString( cliShellCtx.requestSock() )
            if not jsonRpcRequest and not stateless:
               break
            requestCnt, cmdCnt = jsonCli.processRequest( jsonRpcRequest,
                                                         stdoutFileno )
            FastServUtil.writeInteger( statisticsSock, requestCnt )
            FastServUtil.writeInteger( statisticsSock, cmdCnt )
            if stateless:
               # in the case stateless we don't actually want a loop, so break
               # after first iteration
               break
      finally:
         if not stateless:
            capiExecutor.gotoUnprivMode()
         session.close()

   def _runCliConnection( self, cliShellCtx ):
      '''Run an interactive CLI session on the received stdin/stdout/stderr
      and return the value of Cli.main().'''
      sys.stdin.setFile( cliShellCtx.stdin() )
      sys.stdout.setFile( cliShellCtx.stdout() )
      sys.stderr.setFile( cliShellCtx.stderr() )
      options = cliShellCtx.options()
      self.cli_.loadDynamicAliases()
      disableAutoMore = options.disable_automore or options.completions
      session = BasicCliSession.Session( BasicCli.UnprivMode,
                                         self.entityManager_,
                                         privLevel=options.privilege,
                                         disableAaa=options.disable_aaa,
                                         disableAutoMore=disableAutoMore,
                                         disableGuards=options.disable_guards,
                                         startupConfig=options.startup_config,
                                    secureMonitor=SecMonUtil.getCliOption( options ),
                                         standalone=options.standalone,
                                    autoComplete=not options.disable_autocomplete,
                                         interactive=True,
                                         cliInput=cliShellCtx.cliInput(),
                                         cli=self.cli_,
                                         aaaUser=cliShellCtx.aaaUser() )
      try:
         return Cli.main( self.cli_, self.entityManager_, session,
                          cliShellCtx.options(),
                          frontendSock=cliShellCtx.requestSock() )
      finally:
         session.close()

   def run( self ):
      '''Thread entry point: record the tid, signal the server thread that we
      are up, unshare the file system information and serve the connection.'''
      try:
         # inside the try so that a failure here also unblocks the server
         # thread, which waits on threadStartEvent_
         self.tid_ = Syscall.gettid()
         CliPatchSigint.createState( "Shell" )
         # We block the CliServer thread until we have this set. This ensures that
         # tid_ will always be set.
         self.threadStartEvent_.set()
         trace( "CLI thread", self.tid_ )
         # we unshare anything at the kernel level that we don't want the threads
         # to share
         self._unshare()

         # run the thread now that we've struck it out on our own
         self._run()
      except: # pylint: disable-msg=W0702
         # for any escaped exception, always unblock the server thread
         self.threadStartEvent_.set()
         raise
      finally:
         CliPatchSigint.deleteState()

   def _run( self ):
      '''Accept the handshake, run the session and clean up afterwards.

      Errors raised by the session are written to fd 1 instead of being
      propagated; the cleanup in the finally clause runs in every case.
      '''
      returnCode = 0

      try:
         # we create our CliShellContext and accept the connection. Afterwards we
         # can close our connection
         cliShellCtx = CliShellContext()
         try:
            trace( "CLI accepting connection", self.tid_ )
            cliShellCtx.acceptConnection( self.conn_ )
         finally:
            self.conn_.close()
            self.conn_ = None
         trace( "CLI conn", self.tid_, cliShellCtx.ttyType(),
                cliShellCtx.aaaUser().user )
         qt0( 'CliThread conn', qv( self.tid_ ), qv( cliShellCtx.ttyType() ),
              qv( cliShellCtx.aaaUser().user ) )

         # set anything we need to in our environment
         trace( "CLI setting up our environ", self.tid_ )
         self._setEnv( cliShellCtx )

         # we process the request as a CAPI or as a CLI request
         if cliShellCtx.ttyType() == 'c':
            self._runCapiConnection( cliShellCtx, True )
         elif cliShellCtx.ttyType() == 'd':
            self._runCapiConnection( cliShellCtx, False )
         elif cliShellCtx.ttyType() == 's':
            self._runSimpleCliConnection( cliShellCtx )
         else:
            returnCode = self._runCliConnection( cliShellCtx )
      except OSError as e:
         if e.errno == errno.EPIPE:
            # the frontend rudely disconnected from us
            trace( "CLI the frontend rudely disconnected from us", self.tid_ )
         else:
            # written straight to fd 1 rather than through sys.stdout
            os.write(
               1,
               ( "%s %s CliThread Error %s\n" %
                 ( self.tid_, datetime.now(), e ) ).encode()
            )
      except Exception as e: # pylint: disable-msg=W0703
         os.write(
            1,
            ( "%s %s CliThread Error %s\n" %
              ( self.tid_, datetime.now(), e ) ).encode()
         )
         os.write( 1, b'Begin Traceback\n' )
         os.write( 1, traceback.format_exc().encode() + b'\n' )
         os.write( 1, b'End Traceback\n' )
      finally:
         trace( "CLI done, waiting for cleanup...", self.tid_ )
         # ensure that a thread isn't holding the configuration lock
         try:
            BasicCliSession.CONFIG_LOCK.maybeCleanupLock(
                  reason='Auto-released because user logged out' )
         except: # pylint: disable-msg=bare-except
            Excepthook.printException( *sys.exc_info() )
         self.killChildThreads()
         # flush out any remaining bits of output
         sys.stdout.flush()
         sys.stderr.flush()

         # lets stop using our std(in|out|err) and set to back to the original
         if getattr( sys.stdin.threadLocal_, 'localFileStack', None ):
            sys.stdin.unsetFile()
         if getattr( sys.stdout.threadLocal_, 'localFileStack', None ):
            sys.stdout.unsetFile()
         if getattr( sys.stderr.threadLocal_, 'localFileStack', None ):
            sys.stderr.unsetFile()

         # cleanup any files in use by the cliShellCtx
         qt0( 'cliShellCtx cleanup', qv( self.tid_ ) )
         trace( "CLI cleanup", self.tid_ )
         cliShellCtx.cleanup()

         try:
            # if the signal socket is still open lets send a message.
            # this still opens us to a small race between these next
            # 2 lines, but overall it's better than not checking at all
            if self.signalSock_.fileno() != -1:
               FastServUtil.writeInteger( self.signalSock_, returnCode )
         except OSError as e:
            if e.errno == errno.EPIPE:
               # the frontend rudely disconnected from us
               pass
            else:
               print( self.tid_, str( datetime.now() ), 'Signal Socket Error', e )
         except Exception as e: # pylint: disable-msg=W0703
            print( self.tid_, str( datetime.now() ), 'Signal Socket Error', e )

         # flush any errors we printed before the exit the thread
         sys.stdout.flush()
         sys.stderr.flush()
         qt0( 'CliThread exit', qv( self.tid_ ) )

class CliServer:
   # Minimum number of seconds (measured with time.monotonic) that
   # _timeToCheckMemory lets pass between two positive answers.
   CHECK_GC_AND_MEM_INTERVAL = 60
   # Number of consecutive _checkGC calls that may find the garbage collector
   # disabled before it is re-enabled.
   GC_DISABLED_COUNT_THRESHOLD = 100
   MAX_MEMORY_ALLOWED = int( 3.8 * 2 ** 30 ) # 3.8GB in bytes
   # File read by _maybeOomKillSelf to find the size of this process.
   PROC_STAT_FILE = '/proc/self/stat'
   # Index of the virtual memory size in the whitespace-split contents of
   # PROC_STAT_FILE. NOTE(review): presumably counted from the end because the
   # command name near the start may itself contain whitespace; the value
   # depends on the number of fields the kernel reports -- confirm with proc(5).
   VMBYTES_FIELD_INDEX = -30

   def __init__( self, entityManager, cli ):
      '''Create the listening sockets and helper threads and patch the
      process-wide state (os.environ, sys.std*) for multi-threaded use.

      entityManager provides the sysname used in all socket addresses; cli is
      handed on to the threads created by this server. The helper threads are
      only constructed here, not started.
      '''
      trace( 'CliServer.__init__ enter' )
      self.entityManager_ = entityManager
      self.cli_ = cli
      self.sysname_ = entityManager.sysname()
      # socket on which CliShell frontends connect
      self.serverSock_ = self._createSocket( CliCommon.CLI_SERVER_ADDRESS_FMT %
                                             self.sysname_ )
      # socket on which forks are reported (see _acceptForkMonitorConnection);
      # the leading NUL byte puts the address in the abstract namespace
      self.cliForkMonitorSock_ = self._createSocket( '\x00/CliForkMonitor/%s' %
                                                     self.sysname_ )
      # pylint: disable-next=use-dict-literal
      self.signalSockMap_ = dict() # signal sock -> thread
      self.cliCtrlThread_ = CliCtrlThread( self.sysname_, self.signalSockMap_ )
      self.activityThreadMonitorThread_ = ActivityThreadMonitorThread(
                                                            self.signalSockMap_,
                                                            self.cli_ )
      self.subprocMgr_ = CliSubprocMgr.SubprocMgr( self.sysname_ )
      # Variables for the memory monitor
      self.gcDisabledCount = 0 # consecutive _checkGC calls with gc disabled
      self.memoryThreshhold = 0 # memory level that triggers the next report
      self.lastCheckTimestamp = 0 # time.monotonic() of the last check

      CliThreadCtrl.setSysname( self.sysname_ )
      CliThreadCtrl.registerAtForkHandler()
      _Tac.enableThreadSafePointers()

      os.environ[ 'SYSNAME' ] = self.sysname_

      # WARNING START
      # This is a very subtle line. This modifies os.environ in such a way
      # that if the CLI is multi-thread each thread will have its own
      # environment. This is done using thread.local()
      ThreadLocalEnv.ThreadLocalEnv( os.environ )
      # WARNING END
      CliPatchOs.init() # cant move to CliMain (some Cli tests need setuid)
      CliPatchPdb.init() # cant move to CliMain (we allow pdb in TestCli)

      # we replace our stdin with a special file that can be redirected per
      # thread (see the setFile/unsetFile calls in CliThread).
      sys.stdin = ThreadLocalFile.ThreadLocalFile( sys.stdin )
      sys.__stdin__ = sys.stdin

      # Not used, but to force installation of the streamcatcher
      # PyServer.StreamCatcher implicitly replaces our std(out|err)
      # with special files.
      self.streamCatcher_ = PyServer.StreamCatcher()
      sys.__stdout__ = sys.stdout
      sys.__stderr__ = sys.stderr
      trace( 'CliServer.__init__ exit' )

   def _createSocket( self, address ):
      '''Return a listening unix domain stream socket bound to address.

      The socket is marked close-on-exec and non-blocking and listens with a
      backlog of CONNECTION_BACKLOG.
      '''
      trace( 'CliServer._createSocket enter' )
      listener = socket.socket( socket.AF_UNIX, socket.SOCK_STREAM, 0 )
      # add FD_CLOEXEC to whatever descriptor flags are already set
      fdFlags = fcntl.fcntl( listener, fcntl.F_GETFD )
      fcntl.fcntl( listener, fcntl.F_SETFD, fdFlags | fcntl.FD_CLOEXEC )
      listener.bind( address )
      listener.listen( CONNECTION_BACKLOG )
      listener.setsockopt( socket.SOL_SOCKET, socket.SO_REUSEADDR, 1 )
      listener.setblocking( 0 )
      trace( 'CliServer._createSocket exit', listener )
      return listener

   def _acceptForkMonitorConnection( self ):
      '''Record a child process reported on the fork monitor socket.

      This socket is dedicated for forks and should be fast. Three integers
      are read from the connection: the id of the thread that forked, the pid
      of the child it forked and a flag telling whether the child is to be
      skipped when SIGINT is delivered. The child is registered with the
      subprocess manager and appended to the child list of the CliThread with
      the matching tid, if there is one.
      '''
      conn, _ = self.cliForkMonitorSock_.accept()
      try:
         tid = FastServUtil.readInteger( conn )
         childPid = FastServUtil.readInteger( conn )
         skipSigInt = bool( FastServUtil.readInteger( conn ) )
      finally:
         # never leak the connection, even when the report is cut short
         conn.close()
      if childPid: # could happen when fork failed for lack of memory
         self.subprocMgr_.addPid( childPid, skipSigInt )
         for cliThread in self.signalSockMap_.values():
            if cliThread.tid_ == tid:
               cliThread.childPids_.append( childPid )
               break

   def _closeSignalSock( self, sock ):
      '''Close sock and drop its entry from the signal socket map.

      Raises KeyError (after closing the socket) if sock is not in the map.
      '''
      sock.close()
      cliThread = self.signalSockMap_.pop( sock )
      # a thread that never got as far as recording its tid is traced as -1
      tid = -1 if cliThread.tid_ is None else cliThread.tid_
      qt0( 'thread', qv( tid ), 'closed' )

   def _acceptConnection( self ):
      '''Accept one CliShell connection and start a CliThread to serve it.

      The frontend first passes a signal socket over the connection; without
      it the connection is dropped. Once the thread has been started this
      method waits until the thread signals that it is running and then
      records it in the signal socket map.
      '''
      if connectionsDisabled_:
         # NOTE: This is used in testing only
         print( "new connections disabled" )
         # give it a bit delay
         time.sleep( 5 )
         return

      conn, _ = self.serverSock_.accept() # CliShell Connection
      signalSock = FastServUtil.recvSock( conn )
      if not signalSock:
         connFd = conn.fileno()
         try:
            print( '_acceptConnection: connFd:', connFd,
                  'failed to recv signalSock' )
         except OSError:
            pass # :( unable to log, c'est la vie
         conn.close()
         return

      started = threading.Event()
      cliThread = CliThread( self.entityManager_, self.cli_, signalSock, conn,
                             started, self.subprocMgr_ )
      try:
         # cli threads do not run as root, so should not handle logrotate (RTMIN+9)
         Tac.threadsigmask( signal.SIGRTMIN+9, True )
         cliThread.start()
      except RuntimeError as err:
         try:
            print( 'Unable to start thread!!' )
            print( err )
            printCliThreads()
            sys.stdout.flush()
         except OSError:
            pass # :( unable to log, c'est la vie
         # the thread never ran, so nobody else will close these
         conn.close()
         signalSock.close()
         return
      finally:
         Tac.threadsigmaskRestore()

      # To try to maintain a steady state here by waiting until the thread has been
      # launched.
      started.wait()
      qt0( 'thread', qv( cliThread.tid_ ), 'created' )
      self.signalSockMap_[ signalSock ] = cliThread

   def _cleanupThreads( self ):
      '''Close the signal socket of every CliThread that is no longer alive
      and remove it from the signal socket map.'''
      debug( '_cleanupThreads enter' )
      # collect first: the map must not change while it is being iterated
      deadSocks = []
      for sock, cliThread in self.signalSockMap_.items():
         if cliThread.is_alive():
            continue
         trace( "marking to cleanup thread", cliThread.tid_, "socket", sock )
         deadSocks.append( sock )
      for sock in deadSocks:
         trace( "cleanup socket", sock )
         self._closeSignalSock( sock )
      debug( '_cleanupThreads exit' )

   def _checkGC( self ):
      '''Re-enable the garbage collector if it stays disabled for too long.

      Counts consecutive calls that find the collector disabled; once the
      count exceeds GC_DISABLED_COUNT_THRESHOLD the fact is printed and the
      collector is enabled and run. Finding it enabled resets the count.
      '''
      if gc.isenabled():
         self.gcDisabledCount = 0
         return

      self.gcDisabledCount += 1
      if self.gcDisabledCount <= self.GC_DISABLED_COUNT_THRESHOLD:
         return

      print( 'Garbage Collector for', ConfigAgentName.name(),
             'is disabled. Reenabling and running `collect`' )
      gc.enable()
      gc.collect()

   def _printMemUsageReport( self, memory, threads ):
      '''Render an AgentMemoryUsage model built from the given memory and thread
      figures plus the current Agent.garbageReport() result.'''
      fields = dict( memory=memory, threads=threads,
                     mostCommonObjects=Agent.garbageReport() )
      AgentMemoryUsage( **fields ).render()

   def _checkMemUsage( self ):
      '''Report when memory usage crosses memoryThreshhold, then raise the
      threshold to twice the current usage.'''
      memory, threads = Agent.getMemAndThreadCount()
      if memory <= self.memoryThreshhold:
         return

      # A zero threshold means this is the first crossing after startup; nothing
      # has really doubled yet, so only set the threshold without reporting.
      if self.memoryThreshhold:
         agentName = ConfigAgentName.name()
         print( '\nGarbage Collector is enabled:', gc.isenabled() )
         print( '\nMemory for', agentName, 'has doubled. Printing reports.\n' )
         self._printMemUsageReport( memory, threads )
         printCliThreads()
      self.memoryThreshhold = 2 * memory

   def _timeToCheckMemory( self ):
      '''Return True if more than CHECK_GC_AND_MEM_INTERVAL has passed since the
      last check, recording the current time as the new last-check timestamp.'''
      debug( 'CliServer.run time to check memory?' )
      # time.monotonic() rather than Tac.now(): the latter requires the activity
      # lock.
      current = time.monotonic()
      due = ( current - self.lastCheckTimestamp ) > self.CHECK_GC_AND_MEM_INTERVAL
      if due:
         self.lastCheckTimestamp = current
      else:
         debug( 'CliServer.run not time to check memory' )
      return due

   def _maybeOomKillSelf( self ):
      with open( self.PROC_STAT_FILE ) as f:
         vMemBytes = int( f.read().split()[ self.VMBYTES_FIELD_INDEX ] )

      if vMemBytes > self.MAX_MEMORY_ALLOWED:
         ratio = f"({vMemBytes:,}/{self.MAX_MEMORY_ALLOWED:,})"
         print( '\nMaximum allowed memory exceeded.', ratio, 'Exiting.' )
         memory, threads = Agent.getMemAndThreadCount()
         self._printMemUsageReport( memory, threads )
         os.abort()

   def run( self ):
      '''Run the server loop; does not return normally.

      Starts cliCtrlThread_ and activityThreadMonitorThread_, then loops forever:
      cleans up finished CLI threads, periodically checks GC and memory usage,
      and waits in select() on the server socket, the fork monitor socket and
      every signal socket in signalSockMap_. Any exception that escapes the loop
      is handed to logAndAbort().
      '''
      trace( 'CliServer.run enter' )
      self.cliCtrlThread_.start()
      self.activityThreadMonitorThread_.start()

      try:
         while True:
            self._cleanupThreads()

            if self._timeToCheckMemory():
               debug( 'CliServer.run checking memory' )
               try:
                  self._checkGC()
                  self._checkMemUsage()
                  # sys.maxsize only fits in 32 bits on a 32-bit interpreter, so
                  # the self-kill check is limited to those builds.
                  if sys.maxsize <= 2 ** 32:
                     self._maybeOomKillSelf()
               except OSError: # BUG472732; Blows up if /var/log is full.
                  qt0( 'Failed to check GC and mem usage' )

            # SELECT_TIMEOUT bounds the wait, so the housekeeping at the top of
            # the loop still runs when no socket becomes readable.
            try:
               debug( 'CliServer.run starting select!' )
               filesReadyToRead, _, _ = select.select( [ self.serverSock_,
                                                         self.cliForkMonitorSock_ ] +
                                                       list( self.signalSockMap_ ),
                                                       [], [],
                                                       SELECT_TIMEOUT )
            except OSError as e:
               if e.args[ 0 ] == errno.EINTR:
                  # If we get interrupted we just go back to where we were before
                  debug( 'CliServer.run got errno.EINTR!' )
                  continue
               raise # if we get interrupted by something else we should blow up!

            if not filesReadyToRead:
               debug( 'CliServer.run hit timeout!' )
               # we must have hit a timeout
               continue

            trace( 'CliServer.run woke up from our select and have files to read!',
                   filesReadyToRead )
            for f in filesReadyToRead:
               # A socket that is a key of signalSockMap_ is the signal socket of
               # an existing CLI thread; anything else is one of the two
               # listening sockets handled at the bottom of this loop body.
               cliThread = self.signalSockMap_.get( f )
               if cliThread:
                  try:
                     signum = FastServUtil.readInteger( f )
                     trace( "thread", cliThread.tid_, "saw signum", signum )
                  except OSError as e:
                     trace( "thread", cliThread.tid_, "saw socket error", e )
                     signum = None
                  if signum:
                     # A signal number was read: pass it to the thread's child
                     # threads first, then deliver it to the thread itself.
                     trace( "thread", cliThread.tid_, "received signal", signum )
                     cliThread.killChildThreads( signum )
                     try:
                        # SIGINT goes through CliPatchSigint.kill rather than
                        # os.kill.
                        if signum == signal.SIGINT:
                           CliPatchSigint.kill( cliThread.tid_ )
                        else:
                           os.kill( cliThread.tid_, signum )
                     except OSError:
                        # Delivery is best effort (presumably the thread may
                        # already be gone -- confirm).
                        pass
                  else:
                     # No signal number: the read failed or returned a falsy
                     # value (presumably the peer closed the socket -- confirm).
                     # Interrupt the thread and drop its signal socket.
                     trace( "thread", cliThread.tid_, "closing signal socket" )
                     cliThread.killChildThreads()
                     try:
                        trace( "thread", cliThread.tid_, "being sent SIGINT" )
                        CliPatchSigint.kill( cliThread.tid_ )
                     except OSError:
                        pass
                     self._closeSignalSock( f )

               if self.serverSock_ == f:
                  self._acceptConnection()
               if self.cliForkMonitorSock_ == f:
                  self._acceptForkMonitorConnection()

      # An unexpected exception in the cli server thread causes the thread to exit
      # and the cli to hang: better to die so ProcMgr can restart it. If we find an
      # exception case that's legit then a specific try/pass can be added where it
      # is raised.
      except BaseException as e: # pylint: disable-msg=W0703
         self.logAndAbort( e )

   def logAndAbort( self, exception ):
      '''Write diagnostics for an unexpected server-thread exception to stderr
      and abort the process.

      Reports the exception, the server and fork monitor sockets, the keys of
      signalSockMap_ and the active exception's traceback. Never returns:
      os.abort() is called whether or not the reporting itself succeeds.
      '''
      try:
         sys.stdout.flush()
         err = sys.stderr
         header = "Unexpected Exception in ConfigAgent server thr: %s\n"
         err.write( header % exception )
         err.write( 'serverSock_ = %s\n' % self.serverSock_ )
         err.write( 'cliForkMonitorSock_ = %s\n' % self.cliForkMonitorSock_ )
         sockKeys = list( self.signalSockMap_ )
         err.write( 'signalSockMap_.keys() = %s\n' % sockKeys )
         Excepthook.printException( *sys.exc_info() )
         err.flush()
      finally:
         # Reached on success and on any failure above alike.
         os.abort()

class CliServerThread( threading.Thread ):
   '''Thread that owns a CliServer and runs its loop once plugins are loaded.'''

   def __init__( self, entityManager, cli ):
      '''Create the thread and the CliServer it will run (not started here).'''
      info( 'Creating the CLI server thread' )
      super().__init__()
      self.cli_ = cli
      self.cliServer_ = CliServer( entityManager, cli )
      info( 'Done Creating the CLI server thread' )

   def run( self ):
      '''Wait for cli_.pluginsComplete, then hand control to CliServer.run().'''
      info( 'Starting the CLI server thread' )
      CliPatchSigint.createState( "CliServer" )
      pluginsLoaded = self.cli_.pluginsComplete
      Tac.waitFor( pluginsLoaded, sleep=True, description='plugins to load' )
      info( 'Cli Server thread is running after plugins completed' )
      server = self.cliServer_
      server.run()
