#!/usr/bin/env python3
# Copyright (c) 2020 Arista Networks, Inc.  All rights reserved.
# Arista Networks, Inc. Confidential and Proprietary.

import Cell
import os
import SuperServer
import Plugins
import Tac
import tempfile
import Tracing
from IpLibConsts import DEFAULT_VRF
if os.environ.get( 'P4USER' ):
   import WsServiceHelper

# Trace handle for this plugin, registered under the name
# 'TelegrafMgrSuperServerPlugin'.
__defaultTraceHandle__ = Tracing.Handle( 'TelegrafMgrSuperServerPlugin' )
# Short aliases for the Tracing.trace0 / Tracing.trace1 functions.
t0 = Tracing.trace0
t1 = Tracing.trace1

class DestinationInfluxdbReactor( Tac.Notifiee ):
   ''' Reacts to attribute changes of one InfluxDB destination config.

   Every handled attribute (url, databaseName, retentionPolicyName,
   username, password) traces the event and then asks the parent to
   re-evaluate the service through handleStartStopRestartService().
   '''
   notifierTypeName = 'TelegrafMgr::DestinationInfluxdbConfig'

   def __init__( self, notifier, parent ):
      ''' Attach to `notifier` and remember `parent`, the object whose
      handleStartStopRestartService() is called on every change. '''
      Tac.Notifiee.__init__( self, notifier )
      self.parent_ = parent

   # @Tac.handler( 'vrfName' )
   # def handleVrfName( self ):
   #    t0( 'DestinationInfluxdbReactor handleVrfName' )
   #    self.parent_.handleStartStopRestartService()

   @Tac.handler( 'url' )
   def handleUrl( self ):
      ''' The destination url changed. '''
      t0( 'DestinationInfluxdbReactor handleUrl' )
      self.parent_.handleStartStopRestartService()

   @Tac.handler( 'databaseName' )
   def handleDatabaseName( self ):
      ''' The destination database name changed. '''
      t0( 'DestinationInfluxdbReactor handleDatabaseName' )
      self.parent_.handleStartStopRestartService()

   @Tac.handler( 'retentionPolicyName' )
   def handleRetentionPolicyName( self ):
      ''' The destination retention policy name changed. '''
      # Trace the handler name like every other handler in this class so
      # the event can be found by the same search pattern.
      t0( 'DestinationInfluxdbReactor handleRetentionPolicyName' )
      self.parent_.handleStartStopRestartService()

   @Tac.handler( 'username' )
   def handleUsername( self ):
      ''' The destination username changed. '''
      t0( 'DestinationInfluxdbReactor handleUsername' )
      self.parent_.handleStartStopRestartService()

   @Tac.handler( 'password' )
   def handlePassword( self ):
      ''' The destination password changed. '''
      t0( 'DestinationInfluxdbReactor handlePassword' )
      self.parent_.handleStartStopRestartService()

class SourceSocketReactor( Tac.Notifiee ):
   ''' Reacts to attribute changes of one source socket config.

   A change of url or connectionLimit is traced and forwarded to the
   parent's handleStartStopRestartService().
   '''
   notifierTypeName = 'TelegrafMgr::SourceSocketConfig'

   def __init__( self, notifier, parent ):
      ''' Attach to `notifier`; `parent` is notified on every change. '''
      super().__init__( notifier )
      self.parent_ = parent

   @Tac.handler( 'url' )
   def handleUrl( self ):
      ''' The socket url changed. '''
      t0( 'SourceSocketReactor handleUrl' )
      owner = self.parent_
      owner.handleStartStopRestartService()

   @Tac.handler( 'connectionLimit' )
   def handleConnectionLimit( self ):
      ''' The socket connection limit changed. '''
      t0( 'SourceSocketReactor handleConnectionLimit' )
      owner = self.parent_
      owner.handleStartStopRestartService()

class VrfStateReactor( Tac.Notifiee ):
   ''' Watches the state of one VRF status entry.

   Only the VRF whose name equals the parent's configured vrfName
   triggers a re-evaluation of the service.
   '''
   notifierTypeName = 'Ip::VrfStatusLocal'

   def __init__( self, notifier, parent ):
      ''' Attach to `notifier`, remember `parent`, and evaluate the
      current state right away. '''
      super().__init__( notifier )
      self.parent_ = parent
      # Process the state that is already present at creation time.
      self.handleState()

   @Tac.handler( 'state' )
   def handleState( self ):
      ''' Re-evaluate the service when the watched VRF is the configured one. '''
      t0( 'VrfStateReactor handleState' )
      if self.notifier_.vrfName != self.parent_.config_.vrfName:
         # Some other VRF changed; nothing to do.
         return
      self.parent_.handleStartStopRestartService()

class TelegrafMgrConfigReactor( SuperServer.SystemdService ):
   ''' Manages the telegraf daemon '''
   notifierTypeName = 'TelegrafMgr::TelegrafMgrConfig'

   def __init__( self, telegrafManagerContext, telegrafMgrConfig,
         telegrafMgrStatus, allVrfStatus, ocLaunchRequest ):
      ''' Set up file locations, install the default files, initialise the
      SystemdService base and bring the service to its desired state.

      telegrafManagerContext: provides sourceGroupConfig( name ) text.
      telegrafMgrConfig: config entity this reactor is bound to.
      telegrafMgrStatus: status entity (customConfig and running are written).
      allVrfStatus: object whose `vrf` collection holds per-VRF status.
      ocLaunchRequest: directory in which the 'telegraf' entity is
         created or deleted.
      '''
      assert os.path.exists( '/usr/bin/telegraf' )

      self.configFile = '/etc/telegraf/telegraf.conf'
      self.envFile = '/etc/default/telegraf'
      self.serviceFile = '/etc/systemd/system/telegraf.service'
      self.defaultConfigFile = '/etc/telegraf/telegraf.conf.base'
      self.defaultServiceFile = '/usr/lib/telegraf/scripts/telegraf.service.default'
      self.telegrafManagerContext_ = telegrafManagerContext
      self.config_ = telegrafMgrConfig
      self.status_ = telegrafMgrStatus
      self.allVrfStatus = allVrfStatus
      self.ocLaunchRequest_ = ocLaunchRequest
      self.hasValidSource = False
      self.hasValidDest = False
      self.vrfActive = False
      self.destinationInfluxdbReactor_ = None
      self.sourceSocketReactor_ = None
      self.allVrfStateReactor_ = None

      if os.environ.get( 'P4USER' ):
         self.serviceName = 'telegraf'
         self.daemonName = 'telegraf'
         self.pidFile = '/var/run/telegraf/telegraf.pid'
         self.errorLog = '/var/log/agents/Telegraf'
         self.wsOpenFileLimit = 65536
         # `ulimit` only affects the shell that runs it and that shell's
         # children. It therefore has to be issued in the same shell that
         # launches telegraf; as a separate `sh -c` entry the new limit is
         # dropped as soon as that shell exits.
         launchCmd = (
            f'ulimit -n {self.wsOpenFileLimit}; '
            f'nohup /usr/bin/telegraf -pidfile {self.pidFile}'
            f' -config {self.configFile} -config-directory'
            f' /etc/telegraf/telegraf.d >>/dev/null 2>>{self.errorLog} &' )
         # These startCmds are running in sequence
         # Format:
         # cmd, stdout, stderr, timeout, asDaemon, pidWaitTimeout
         self.startCmds = [
            ( [ 'mkdir', '-p', '/var/log/agents' ], Tac.INHERIT, Tac.INHERIT,
              600, False, 0 ),
            ( [ 'mkdir', '-p', '/var/run/telegraf' ], Tac.INHERIT, Tac.INHERIT,
              600, False, 0 ),
            ( [ 'sh', '-c', launchCmd ],
              Tac.INHERIT, Tac.INHERIT, 600, False, 600 ),
         ]

      # Put config/service files in place before init SuperServer
      self.loadDefaultFiles()
      SuperServer.SystemdService.__init__(
         self, 'telegraf',
         'telegraf',
         telegrafMgrConfig,
         self.configFile,
         configFileHeaderEnabled=False,
         healthCheckNeeded=True,
      )
      # Overwrite the checking interval to 10 seconds
      self.healthMonitorInterval = 10

      self.createReactors()
      # Make sure initial state is stopped before we check what should happen
      self.stopService()
      self.handleStartStopRestartService()
      # Check if custom file is present
      self._checkServiceHealth()

   def customConfigChanged( self ):
      ''' Return True when `diff` prints anything for the recorded custom
      config file versus the installed config file. '''
      diffCmd = [ 'diff', self.status_.customConfig.filename, self.configFile ]
      # diff's return code is ignored; only its captured stdout is used.
      diffText = Tac.run( diffCmd, stdout=Tac.CAPTURE, asRoot=True,
                          ignoreReturnCode=True )
      return bool( diffText )

   # The implementation of TelegrafMgr has already gone far enough
   # that it effectively nullifies the purpose of most of the base
   # implementation in SuperServer. The base implementation relies on
   # conf() to return the new config, and compares it with the file we
   # pointed it at to control restarting the agent. However, that is
   # limited to the base class's onAttribute change handling, and
   # TelegrafMgr already has its own explicit control over attributes
   # in multiple nested layers.

   # We never kick the base implementation of _maybeRestartService
   # because, as it stands, it doesn't do anything useful. There is
   # also a chance that it unexpectedly disarms the timer for
   # _checkServiceHealth, and no other code path re-enables it.
   def _maybeRestartService( self ):
      pass

   def sync( self ):
      pass

   def checkCustomConfigFile( self ):
      # We found new custom config file added
      if not self.status_.customConfig.filename and \
         os.path.exists( '/mnt/flash/telegraf.conf' ):
         t0( 'new custom config file found' )
         # below will call updateConfigFile
         self.handleStartStopRestartService()
         # Custom config file removed
      elif self.status_.customConfig.filename and \
           not os.path.exists( '/mnt/flash/telegraf.conf' ):
         t0( 'custom config file removed' )
         # below will call updateConfigFile
         self.handleStartStopRestartService()
         # Detect custom config file change. At this point there is
         # definitely a config at /etc/telegraf/telegraf.conf
      elif self.status_.customConfig.filename and \
           os.path.exists( self.configFile ) and \
           os.path.exists( self.status_.customConfig.filename ) and \
           self.customConfigChanged():
         t0( 'custom config file changed' )
         self.handleStartStopRestartService()

   def _checkServiceHealth( self ):
      ''' Run the base-class health check, then check the custom config
      file for additions, removals or changes. '''
      t0( 'checkServiceHealth' )
      # check daemon is running properly
      super()._checkServiceHealth()

      # Piggyback on the periodic health check to notice custom config
      # file changes.
      self.checkCustomConfigFile()

   def loadDefaultFiles( self ):
      ''' Copy the default config and service files into place and, unless
      P4USER is set in the environment, reload systemd. '''
      # Load config and service files packaged with InfluxTelemetry
      packagedFiles = (
         ( self.defaultConfigFile, self.configFile ),
         ( self.defaultServiceFile, self.serviceFile ),
      )
      for source, target in packagedFiles:
         Tac.run( [ 'cp', source, target ], asRoot=True )

      if os.environ.get( 'P4USER' ):
         return
      # Actually make systemd reload the new service file
      Tac.run( [ 'systemctl', 'daemon-reload' ], asRoot=True )

   def createReactors( self ):
      ''' Create the collection reactors for destinations, source sockets
      and VRF status; each per-entry reactor receives this object as its
      parent. '''
      def watch( collection, reactorType ):
         # One collectionChangeReactor per watched collection.
         return Tac.collectionChangeReactor(
            collection, reactorType, reactorArgs=( self, ) )

      self.destinationInfluxdbReactor_ = watch(
         self.config_.destinationInfluxdbConfig, DestinationInfluxdbReactor )
      self.sourceSocketReactor_ = watch(
         self.config_.sourceSocketConfig, SourceSocketReactor )
      self.allVrfStateReactor_ = watch(
         self.allVrfStatus.vrf, VrfStateReactor )

   def serviceEnabled( self ):
      ''' Return the current shouldTelegrafRun() decision. '''
      t0( 'serviceEnabled called' )
      return self.shouldTelegrafRun()

   def serviceProcessWarm( self ):
      ''' Return the same value as serviceEnabled(). '''
      t0( 'serviceProcessWarm called.' )
      return self.serviceEnabled()

   def conf( self ):
      ''' Return the text of the file named by self.confFilename_, or an
      empty string when it cannot be opened or read (OSError). '''
      t0( 'conf called.' )
      try:
         with open( self.confFilename_ ) as confFile:
            return confFile.read()
      except OSError:
         t0( 'Error: invalid config file' )
      return ''

   def shouldTelegrafRunOcta( self ):
      # only if the config has gnmi_arista, we need Octa to run
      return "[[inputs.gnmi_arista]]" in self.conf()

   def shouldTelegrafRun( self ):
      if self.status_.customConfig.filename and \
         os.path.exists( self.status_.customConfig.filename ):
         # Start telegraf if user wants to overwrite all decision
         return True

      # Default decision based on having valid source/dest/vrf
      return self.hasValidSource and self.hasValidDest and self.vrfActive

   def writeDestinationInfluxdbConfig( self, configFile ):
      with open( configFile, 'a' ) as f:
         for destName, dest in \
                  self.config_.destinationInfluxdbConfig.items():
            url = getattr( dest, 'url', None )
            dbName = getattr( dest, 'databaseName', None )
            retentionPolicy = getattr( dest, 'retentionPolicyName', None )
            if url and dbName and retentionPolicy:
               conf = ''
               conf += '\n[[outputs.influxdb]]\n'
               conf += f'  # destination influxdb {destName}\n'
               conf += f'  urls = [\"{url}\"]\n'
               conf += f'  database = \"{dbName}\"\n'
               conf += '  retention_policy = \"{}\"\n'.format(
                  '' if retentionPolicy == 'default' else retentionPolicy
               )
               if getattr( dest, 'username', None ):
                  conf += f'  username = \"{dest.username}\"\n'
                  # reverse the password back to plain text
                  conf += f'  password = \"{dest.password.getClearText()}\"\n'
               # additional configs
               conf += '  write_consistency = \"any\"\n'
               conf += '  timeout = \"5s\"\n'
               conf += '  ssl_ca = \"\"\n'
               conf += '  ssl_cert = \"\"\n'
               conf += '  ssl_key = \"\"\n'
               conf += '  insecure_skip_verify = true\n'
               f.write( conf )
               self.hasValidDest = True

   def genSourceSocketConfigFile( self, socketName, socketConf, comment=None ):
      conf = ''
      conf += '\n[[inputs.socket_listener]]\n'
      if comment:
         conf += f'  # {comment}\n'
      conf += '  service_address = \"{}\"\n'.format(
         socketConf.url
      )
      if getattr( socketConf, 'connectionLimit', None ):
         conf += '  max_connections = {}\n'.format(
            socketConf.connectionLimit )
      conf += '  socket_mode = \"777\"\n'

      return conf

   def writeSourceSocketConfig( self, configFile ):
      with open( configFile, 'a' ) as f:
         for socketName, socketConf in \
             self.config_.sourceSocketConfig.items():
            if getattr( socketConf, 'url', None ) is not None:
               f.write( self.genSourceSocketConfigFile(
                  socketName, socketConf,
                  f'source socket {socketName}' ) )
               self.hasValidSource = True

   def writeGlobalTagsConfig( self, configFile ):
      with open( configFile, 'a' ) as f:
         conf = ''
         conf += '\n[global_tags]\n'
         if not os.environ.get( 'P4USER' ):
            serialNumber = Tac.run(
               [ 'sh', '-c',
                 'cat /etc/prefdl 2>&1 | grep -w SerialNumber: | cut -d: -f2' ],
               stdout=Tac.CAPTURE ).strip()
            conf += f'  serial_number = \"{serialNumber}\"\n'
            sku = Tac.run(
               [ 'sh', '-c',
                 'cat /etc/prefdl 2>&1 | grep -w SKU: | cut -d: -f2' ],
               stdout=Tac.CAPTURE ).strip()
            conf += f'  model_name = \"{sku}\"\n'
         for key, value in self.config_.globalTags.items():
            conf += f'  {key} = \"{value}\"\n'
         f.write( conf )

   def writeSourceGroupConfig( self, configFile ):
      with open( configFile, 'a' ) as f:
         # always print out the standard config first, then print out the sorted
         # configs after that
         if self.config_.sourceGroups.get( 'standard', 'disabled' ) == 'enabled':
            f.write( self.telegrafManagerContext_.sourceGroupConfig( 'standard' ) )
            self.hasValidSource = True

         for group, status in sorted( self.config_.sourceGroups.items() ):
            if group == 'standard' or status == 'disabled':
               continue
            f.write( self.telegrafManagerContext_.sourceGroupConfig( group ) )
            self.hasValidSource = True

   def updateFinalConfig( self, filename ):
      ''' Install `filename` as the telegraf config file and restrict its
      permissions to mode 600. '''
      target = self.configFile
      t0( target )
      Tac.run( [ 'cp', filename, target ], asRoot=True )
      Tac.run( [ 'sync' ], asRoot=True )
      assert os.path.exists( target )
      # This is to tighten the permission of the final config file
      # that may include credentials. Telegraf is running as root so
      # this will still work for Telegraf itself.
      # NOTE(review): the mode is only tightened after the copy, so the
      # file briefly keeps whatever mode it had before - confirm that is
      # acceptable.
      Tac.run( [ 'chmod', '600', target ], asRoot=True )

   def updateConfigFile( self ):
      ''' Regenerate the installed telegraf config file.

      When /mnt/flash/telegraf.conf exists it is recorded in status as a
      full custom config and installed verbatim. Otherwise the custom
      config status is cleared and the config is assembled in a temporary
      file from the base file plus the generated sections.
      '''
      t0( 'updateConfigFile' )
      if os.path.exists( '/mnt/flash/telegraf.conf' ):
         # We take this as the full config file.
         fullType = Tac.Type( 'TelegrafMgr::CustomConfigFileType' ).full
         self.status_.customConfig = Tac.Value(
            'TelegrafMgr::CustomConfigFile',
            '/mnt/flash/telegraf.conf',
            fullType )
         self.updateFinalConfig( '/mnt/flash/telegraf.conf' )
         return

      self.status_.customConfig = Tac.Value( 'TelegrafMgr::CustomConfigFile' )

      # Sections are appended to the scratch file in exactly this order.
      sectionWriters = (
         self.writeGlobalTagsConfig,
         self.writeSourceGroupConfig,
         self.writeSourceSocketConfig,
         self.writeDestinationInfluxdbConfig,
      )
      try:
         with tempfile.NamedTemporaryFile() as scratch:
            # Take the base config file and put in place
            baseFile = '/etc/telegraf/telegraf.conf.base'
            t0( baseFile )
            t0( scratch.name )
            Tac.run( [ 'cp', baseFile, scratch.name ], asRoot=True )
            for appendSection in sectionWriters:
               appendSection( scratch.name )
            self.updateFinalConfig( scratch.name )
      except OSError as e:
         # best effort, print error and not update if
         # we have trouble writing to temp file
         t0( e )

   def updateServiceFile( self ):
      ''' Write NET_NS=<namespace> to the env file and record whether the
      configured VRF is active.

      The default VRF uses its own name and is always considered active;
      any other VRF uses 'ns-' + vrfName and is active only when its
      status entry exists with state "active".
      '''
      vrfName = self.config_.vrfName
      inDefaultVrf = vrfName == DEFAULT_VRF
      nsName = vrfName if inDefaultVrf else 'ns-' + vrfName
      if inDefaultVrf:
         self.vrfActive = True
      else:
         vrfStatus = self.allVrfStatus.vrf.get( vrfName )
         if vrfStatus and vrfStatus.state == "active":
            self.vrfActive = True

      envLine = f'NET_NS={nsName}\n'.encode()
      try:
         with tempfile.NamedTemporaryFile() as scratch:
            scratch.file.write( envLine )
            # Flush so that `cp` sees the content.
            scratch.flush()
            Tac.run( [ 'cp', scratch.name, self.envFile ], asRoot=True )
            Tac.run( [ 'sync' ], asRoot=True )
      except OSError as e:
         # best effort, print error and not update if
         # we have trouble writing to temp file
         t0( e )

   def handleStartStopRestartService( self ):
      ''' Regenerate the config and env files, update the Octa launch
      request, then start, restart or stop telegraf as needed. '''
      t0( 'handleStartStopRestartService' )
      # Reset the validity flags; the two update calls below set them again.
      self.hasValidSource = False
      self.hasValidDest = False
      self.vrfActive = False
      self.updateConfigFile()
      self.updateServiceFile()

      wantRunning = self.shouldTelegrafRun()
      wantOcta = wantRunning and self.shouldTelegrafRunOcta()
      # Octa should only be running if eos-native paths are used to collect stats.
      if wantOcta:
         self.ocLaunchRequest_.createEntity( 'Tac::Dir', 'telegraf' )
      else:
         self.ocLaunchRequest_.deleteEntity( 'telegraf' )

      if wantRunning and self.status_.running:
         t0( 'Restart telegraf service' )
         self.restartService()
      elif wantRunning:
         t0( 'Start telegraf service' )
         self.startService()
         self.status_.running = True
      elif self.status_.running:
         t0( 'Stop telegraf service' )
         self.stopService()
         self.status_.running = False

   @Tac.handler( 'vrfName' )
   def handleVrfName( self ):
      ''' The configured vrfName changed: re-evaluate the service. '''
      t0( 'TelegrafMgrConfigReactor handleVrfName' )
      self.handleStartStopRestartService()

   @Tac.handler( 'destinationInfluxdbConfig' )
   def handleDestinationInfluxdbConfig( self, key=None ):
      ''' The destinationInfluxdbConfig collection changed: re-evaluate
      the service. `key` is not used. '''
      t0( 'TelegrafMgrConfigReactor handleDestinationInfluxdbConfig' )
      # The condition for starting service is, if there is
      # a single legitimate destination influxdb connection,
      # we should keep telegraf running
      self.handleStartStopRestartService()

   @Tac.handler( 'sourceSocketConfig' )
   def handleSourceSocketConfig( self, key=None ):
      ''' The sourceSocketConfig collection changed: re-evaluate the
      service. `key` is not used. '''
      t0( 'TelegrafMgrConfigReactor handleSourceSocketConfig' )
      self.handleStartStopRestartService()

   @Tac.handler( 'globalTags' )
   def handleGlobalTags( self, key ):
      ''' The globalTags collection changed: re-evaluate the service.
      `key` is not used. '''
      t0( 'TelegrafMgrConfigReactor handleGlobalTags' )
      self.handleStartStopRestartService()

   @Tac.handler( 'sourceGroups' )
   def handleSourceGroups( self, key ):
      ''' The sourceGroups collection changed: re-evaluate the service.
      `key` is not used. '''
      t0( 'TelegrafMgrConfigReactor handleSourceGroups' )
      self.handleStartStopRestartService()

   def serviceCmd( self, cmd ):
      ''' Run a service command.

      When P4USER is set in the environment, 'start', 'stop', 'restart'
      and 'status' are handled by WsServiceHelper; everything else goes to
      the SystemdService implementation.
      '''
      if not os.environ.get( 'P4USER' ):
         return SuperServer.SystemdService.serviceCmd( self, cmd )

      if cmd == 'start':
         return WsServiceHelper.start(
            self.serviceName, self.daemonName, self.startCmds,
            self.pidFile, self.configFile )
      if cmd == 'stop':
         return WsServiceHelper.stop(
            self.serviceName, self.daemonName, self.pidFile )
      if cmd == 'restart':
         return WsServiceHelper.restart(
            self.serviceName, self.daemonName, self.startCmds,
            self.pidFile, self.configFile )
      if cmd == 'status':
         return WsServiceHelper.status( self.daemonName, self.pidFile )

      # Any other command still goes to the base implementation.
      return SuperServer.SystemdService.serviceCmd( self, cmd )

class TelegrafMgr( SuperServer.SuperServerAgent ):
   ''' SuperServer agent that mounts the TelegrafMgr state and owns the
   TelegrafMgrConfigReactor. '''

   def __init__( self, entityManager, telegrafManagerContext ):
      ''' Mount config, status, VRF status and the launch-request dir; the
      reactor is created once the mounts complete and the agent is active. '''
      super().__init__( entityManager )
      self.telegrafManagerContext_ = telegrafManagerContext
      self.telegrafMgrReactor_ = None

      mountGroup = entityManager.mountGroup()
      self.telegrafMgrConfig = mountGroup.mount(
         'telegrafMgr/config', 'TelegrafMgr::TelegrafMgrConfig', 'r' )
      self.telegrafMgrStatus = mountGroup.mount(
         'telegrafMgr/status', 'TelegrafMgr::TelegrafMgrStatus', 'w' )
      self.allVrfStatus = mountGroup.mount(
         Cell.path( "ip/vrf/status/local" ), 'Ip::AllVrfStatusLocal', 'r' )
      self.ocLaunchRequest = mountGroup.mount(
         'mgmt/gnmi/launchRequest', 'Tac::Dir', 'w' )

      def _mountsReady():
         # Only an active agent creates the reactor at mount completion.
         if self.active():
            self.createReactors()
      mountGroup.close( _mountsReady )

   def createReactors( self ):
      ''' Instantiate the config reactor with all mounted entities. '''
      self.telegrafMgrReactor_ = TelegrafMgrConfigReactor(
         self.telegrafManagerContext_,
         self.telegrafMgrConfig,
         self.telegrafMgrStatus,
         self.allVrfStatus,
         self.ocLaunchRequest )

   def onSwitchover( self, protocol ):
      ''' Create the reactor on switchover; `protocol` is not used. '''
      self.createReactors()

class TelegrafManagerContext:
   ''' Registry mapping a source group name to its config text. '''

   def __init__( self ):
      self.sourceGroups_ = {}

   def sourceGroupConfigIs( self, sourceGroup, config ):
      ''' Store (or replace) the config text of `sourceGroup`. '''
      t0( 'adding source group', sourceGroup, 'with config', config )
      self.sourceGroups_[ sourceGroup ] = config

   def sourceGroupConfig( self, sourceGroup ):
      ''' Return the config text of `sourceGroup`, or '' when unknown. '''
      t0( 'looking up source group config', sourceGroup )
      registry = self.sourceGroups_
      return registry[ sourceGroup ] if sourceGroup in registry else ''

def Plugin( ctx ):
   ''' Plugin entry point: build the shared context, load the
   "TelegrafConfigPlugin" plugins into it, and register the TelegrafMgr
   service on `ctx`. '''
   context = TelegrafManagerContext()
   Plugins.loadPlugins( "TelegrafConfigPlugin", context=context )
   service = TelegrafMgr( ctx.entityManager, context )
   ctx.registerService( service )
