#!/usr/bin/env python3
# Copyright (c) 2022 Arista Networks, Inc.  All rights reserved.
# Arista Networks, Inc. Confidential and Proprietary.
#
# Cleans the given directory of unnecessary agent log files that were left undeleted
# due to crashes and restarts, which resulted in memory bloat, especially when the
# files were large but not large enough to cause logrotate to compress them.
#
# The script only keeps the log files associated with a specified number of the
# oldest pids and with a specified number of the newest pids. By default, the files
# kept are those associated with the oldest pid and the three newest ones, but these
# counts can be changed through the script's arguments.
#
# This script is intended to be invoked by LogArchiver
#
# Example command usage:
# ---------------------
#
#       archivecleanup.py --src /var/log/agents
# or
#       archivecleanup.py -s /var/log/agents
#
# The only required argument is the absolute path to the directory that contains the
# agent logs, with that usually being /var/log/agents.
#
# Additional command options:
# --------------------------
#
#  -o or --oldtokeep: either of these can be used to specify how many of the oldest
#                     pids should be kept. The argument should be a positive integer
#                     otherwise the default value of one will be used
#
# -n or --newtokeep: either of these can be used to specify how many of the newest
#                    pids should be kept. The argument should be a positive integer
#                    otherwise the default value of three will be used
#
# Additional information:
# ----------------------
#
# It is important to note that the number of pids kept is per agent, meaning that
# each agent will be left with at most the sum of the oldest and newest number of
# pids specified.  With logrotate, each pid may have multiple files due to the
# rotation.  By default, logrotate is configured to keep a total of up to 5 log
# files per pid.  Thus with the default values, this script will keep up to
# 4 pids per agent, and up to 5 log files per pid, thus a total of up to 20 log
# files per agent.
#
# The files considered to belong to the same agent AGENTNAME are those named either
# AGENTNAME-PID or AGENTNAME-PID.COMPRESSSUFFIX, meaning that compressed log files
# are also kept or deleted by this script.  Certain agents do have
# "qualifiers".  For example, the Strata agent creates log files for
# "Strata-Linecard0" and "Strata-Fabric0".  For the purposes of this script, those
# are considered to be separate agents.

import argparse
import os
import sys
import syslog
import re
from collections import defaultdict

# Identifier passed to syslog.openlog() in main() for this script's syslog messages.
LOG_PREFIX = 'archivecleanup'

# Fallback keep counts: used as the defaults of the --oldtokeep / --newtokeep
# command line options and substituted by deleteUnwantedFiles() whenever a
# non-positive count is supplied.  Despite "LOGS" in the names, they count pids
# per agent (each pid may own several log files).
DEFAULT_NUM_OF_OLD_LOGS_TO_KEEP = 1
DEFAULT_NUM_OF_NEW_LOGS_TO_KEEP = 3

# Matches the dot-free stem of an agent log file name: an agent name made of word
# characters other than underscore, an optional "-qualifier" of the same kind
# (e.g. "Strata-Linecard0"), and a trailing "-PID" made of digits.  Group 1 is the
# agent name including its qualifier.
AGENT_LOG_PATTERN = r'^([^_\W]+(-[^_\W]+)?)\-\d+$'

def parseFileInfo( path ):
   """
   Extracts the agent name, modification time and pid of an agent log file.

   Params
   path: str
         the path to given file; a bare file name without any directory part is
         accepted as well

   Output
   agent: str
          the name of the agent that the given file logged, including its
          qualifier if it has one

   time: float
         the last time the file was modified

   pid: str
        the pid number related to the file

   Raises
   FileNotFoundError: if the file name is not of the form AGENTNAME-PID (optionally
                      followed by a dotted suffix), or if the file does not exist
                      when its modification time is read
   """
   # os.path.basename also copes with a path that has no directory component,
   # where indexing the result of rsplit( '/', 1 ) would raise IndexError.
   filename = os.path.basename( path )
   # Drop everything from the first dot onwards (e.g. a compression suffix).
   namePid = filename.split( '.', 1 )[ 0 ]
   m = re.search( AGENT_LOG_PATTERN, namePid )
   if not m:
      # Callers treat this the same way as a vanished file: the path is skipped.
      raise FileNotFoundError( path )
   agent = m.group( 1 )
   # The pattern guarantees that the text after the last dash is the numeric pid.
   pid = namePid.rsplit( '-', 1 )[ 1 ]
   time = os.path.getmtime( path )
   return agent, time, pid

def getAgentLogsMetadata( files ):
   """
   Groups the given log file paths by agent name and then by pid.

   Params
   files: List[ str ]
          paths of the files that are candidates for cleanup

   Output
   fileDict: Dict[ str, Dict[ str, Tuple[ List[ str ], float ] ] ]
             Laid out as
             Dict[ agentName, Dict[ pid, Tuple[ List[ path ], mTime ] ] ]
             where List[ path ] holds every log file path seen for that agent and
             pid, and mTime is the modification time of the file of that group
             that was processed last.  Only one time is stored per pid; the
             files of a pid are expected to have close modification times.

   Paths for which parseFileInfo raises FileNotFoundError are left out.
   """

   def _emptyPidEntry():
      # No paths seen yet and a placeholder modification time.
      return ( [], 0 )

   def _emptyAgentEntry():
      return defaultdict( _emptyPidEntry )

   logsByAgent = defaultdict( _emptyAgentEntry )

   for logPath in files:
      try:
         agentName, mTime, pidStr = parseFileInfo( logPath )
      except FileNotFoundError:
         continue
      logsOfAgent = logsByAgent[ agentName ]
      knownPaths = logsOfAgent[ pidStr ][ 0 ]
      # A new tuple is stored each time, so mTime always reflects the latest file.
      logsOfAgent[ pidStr ] = ( knownPaths + [ logPath ], mTime )

   return logsByAgent

def getFiles( src ):
   """
   Lists the files located directly inside the directory to be cleaned up.

   Params
   src: str
        specifies the path to the directory to be cleaned up

   Output
   files: List[ str ]
          the paths ( src joined with the entry name ) of the entries of src for
          which os.path.isfile is true; sub-directories are skipped and are not
          descended into

   Raises
   NotADirectoryError: if src does not exist or is not a directory.  The problem
                       is reported on stdout and through syslog before raising.
   """

   # isdir() is already False for a path that does not exist, so a separate
   # exists() check is not needed.
   if not os.path.isdir( src ):
      print( "Error: invalid directory path specified for cleanup:", src )
      # Adjacent string literals are used rather than a backslash continuation
      # inside the literal, which would embed the source indentation in the
      # logged message.
      syslog.syslog( '%%ERR-LOG: invalid directory path specified for cleanup: '
                     f'{ src }' )
      raise NotADirectoryError( src )

   candidates = ( os.path.join( src, fileName ) for fileName in os.listdir( src ) )
   return [ path for path in candidates if os.path.isfile( path ) ]

def deleteUnwantedFiles( fileDict, numOldToKeep, numNewToKeep ):
   """
   Deletes, for every agent, the log files of the pids that are neither among the
   oldest nor among the newest ones.

   Params
   fileDict: Dict[ str, Dict[ str, Tuple[ List[ str ], float ] ] ]
             the collection represents
             Dict[ agentName, Dict[ pid, Tuple[ List[ path ], mTime ] ] ]
             where List[ path ] is the list of paths of all log files of given agent
             and pid, and mTime is the modified time of one of said files.

   numOldToKeep, numNewToKeep: int
                               both numbers specify how many pids are to be kept
                               per agent.  Both are checked to be positive
                               numbers, otherwise the corresponding default value
                               is used and the problem is reported.

   Effect
      For each agent the pids are ordered by mTime; the first numOldToKeep and the
      last numNewToKeep pids are kept and every file of the pids in between is
      removed.  Agents with no more than numOldToKeep + numNewToKeep pids are left
      untouched.  Files that are already gone are ignored.  A file that cannot be
      removed for another reason is skipped so that the rest of the cleanup still
      happens; such failures are reported once at the end.
   """
   if numOldToKeep <= 0:
      print( f"Error: numOldToKeep argument { numOldToKeep } is invalid, "
             f"proceeding with default { DEFAULT_NUM_OF_OLD_LOGS_TO_KEEP }" )
      syslog.syslog( f'%%ERR-LOG: invalid numOldToKeep argument: { numOldToKeep }. '
                     f'Utilized default { DEFAULT_NUM_OF_OLD_LOGS_TO_KEEP }' )
      numOldToKeep = DEFAULT_NUM_OF_OLD_LOGS_TO_KEEP
   if numNewToKeep <= 0:
      print( f"Error: numNewToKeep argument { numNewToKeep } is invalid, "
             f"proceeding with default { DEFAULT_NUM_OF_NEW_LOGS_TO_KEEP }" )
      syslog.syslog( f'%%ERR-LOG: invalid numNewToKeep argument: { numNewToKeep }. '
                     f'Utilized default { DEFAULT_NUM_OF_NEW_LOGS_TO_KEEP }' )
      numNewToKeep = DEFAULT_NUM_OF_NEW_LOGS_TO_KEEP

   failedPaths = []
   for pidLogs in fileDict.values():
      # One ( paths, mTime ) entry per pid, oldest first.
      pidEntries = sorted( pidLogs.values(), key=lambda entry: entry[ 1 ] )
      numPidsToDelete = len( pidEntries ) - ( numOldToKeep + numNewToKeep )
      # The explicit check matters: when the keep counts exceed the number of pids
      # the slice end below would otherwise not describe an empty range.
      if numPidsToDelete <= 0:
         continue
      firstDoomed = numOldToKeep
      for paths, _ in pidEntries[ firstDoomed : firstDoomed + numPidsToDelete ]:
         for path in paths:
            try:
               os.remove( path )
            except FileNotFoundError:
               # Somebody else already removed it, which is the desired outcome.
               pass
            except OSError:
               # Do not let one undeletable file stop the cleanup of the others.
               failedPaths.append( path )

   if failedPaths:
      print( f"Error: failed to delete { len( failedPaths ) } log file(s), "
             f"first one: { failedPaths[ 0 ] }" )
      syslog.syslog( f'%%ERR-LOG: failed to delete { len( failedPaths ) } log '
                     f'file(s), first one: { failedPaths[ 0 ] }' )


def main( src, numOldToKeep, numNewToKeep ):
   """
   Runs the whole cleanup of the src directory.

   Params
   src: str
        path to the directory holding the agent logs

   numOldToKeep, numNewToKeep: int
                               how many of the oldest and newest pids to keep per
                               agent; passed on unchanged to deleteUnwantedFiles

   Effect
      opens syslog under LOG_PREFIX, collects the files of src, groups them per
      agent and pid, and deletes the ones that are not to be kept.  Returns without
      deleting anything when getFiles raises NotADirectoryError.
   """
   syslog.openlog( LOG_PREFIX )

   try:
      candidatePaths = getFiles( src )
   except NotADirectoryError:
      # getFiles has already reported the invalid directory.
      return

   deleteUnwantedFiles( getAgentLogsMetadata( candidatePaths ),
                        numOldToKeep, numNewToKeep )

if __name__ == '__main__':
   # Command line entry point: --src is mandatory, both keep counts are optional.
   cliParser = argparse.ArgumentParser( prog=sys.argv[ 0 ] )
   cliParser.add_argument( '-s', '--src', required=True,
                           help='Source of the directory to be cleaned' )

   # The two keep-count options only differ in their flags, help text and default.
   keepCountOptions = (
      ( '-o', '--oldtokeep', 'Number of oldest logs to be kept',
        DEFAULT_NUM_OF_OLD_LOGS_TO_KEEP ),
      ( '-n', '--newtokeep', 'Number of newest logs to be kept',
        DEFAULT_NUM_OF_NEW_LOGS_TO_KEEP ),
   )
   for shortFlag, longFlag, helpText, fallback in keepCountOptions:
      cliParser.add_argument( shortFlag, longFlag, help=helpText, type=int,
                              default=fallback )

   cliArgs = cliParser.parse_args()
   main( cliArgs.src, cliArgs.oldtokeep, cliArgs.newtokeep )
