#!/usr/bin/env python3
# Copyright (c) 2024 Arista Networks, Inc.  All rights reserved.
# Arista Networks, Inc. Confidential and Proprietary.
"""
   Providing supplemental code for the rasdaemon that likely could not be
   integrated upstream to the rasdaemon. Removes nullbytes and generates
   a helpful message for Administrators that if a Corrected memory error is
   repeated for a given memory address that they should Contact Support
   or for any Uncorrected memory error they should Contact Support.

   Replaced kernel patch (BUG940730):
   arista-BUG61745-log-ecc-errors-with-friendly-message.patch

   For Corrected memory error, we rate-limit the number of messages
   written for a particular memory address to once every 30 minutes
   per memory address, to prevent the messages file from being flooded
   with these messages. The kernel mce rate limiting has similar rate
   limiting.

   New with the rasdcohort: we log our "Helpful" log ecc error friendly
   messages so that they are also recorded in /var/log/eos. The kernel
   messages just went into /var/log/messages but we can request the
   messages go into /var/log/eos so that the Cli show logging command
   will print these messages out.

   Note: As long as messages go into the dmesg buffer they will be reported
   in show tech file output. See BUG940730 for examples and details.

   Author: echron@arista.com
   Date:   2024/04/23
"""
import re
import sys
import syslog
import time

# Output files
LOGFILE = "/var/log/mcelog"   # cleaned mce_record lines are appended here
DMESGFILE = "/dev/kmsg"       # helpful messages are also written here (dmesg)
# Priority prefixes placed ahead of each /dev/kmsg message, using the syslog
# "<facility * 8 + severity>" encoding: 12 is user.warning, 11 is user.err
USER_WARNING = "<12>"
USER_ERROR = "<11>"

# Per memory address bookkeeping for Corrected errors, keyed by the hex
# address string taken from the mce_record. Each value is a dict holding
# 'count' (records seen for the address) and 'secs' (an epoch timestamp).
addrs: dict[ str, dict[ str, int ] ] = {}

def msgwrite( pre: str, msg: str, addrvalue: str, facilpri: str ) -> None:
   """
   Write a helpful message to syslog and to the kernel log (dmesg) buffer

   pre:       log-id prefix placed ahead of the text sent to syslog
   msg:       the message text; /dev/kmsg receives this text on its own,
              without the prefix or the memory address
   addrvalue: memory address as hex digits without a leading 0x. May be
              None or empty, in which case no "Memory Address" suffix is
              added to the syslog text
   facilpri:  USER_ERROR or USER_WARNING. Written verbatim ahead of the
              /dev/kmsg text and used to pick the syslog priority
              (LOG_ERR for USER_ERROR, LOG_WARNING for anything else)

   Raises OSError if DMESGFILE cannot be opened or written.
   """
   amsg = pre + msg
   if addrvalue:
      # Append to the prefixed text so the log-id prefix is not lost, and
      # make sure the address is separated from the preceding sentence
      if not amsg.endswith( ' ' ):
         amsg += ' '
      amsg += f"Memory Address: 0x{addrvalue}"
   priority = syslog.LOG_ERR if facilpri == USER_ERROR else syslog.LOG_WARNING
   syslog.syslog( priority, amsg )
   with open( DMESGFILE, "w", encoding='utf-8' ) as dfile:
      # One write per record; the leading "<N>" sets the kmsg priority
      dfile.write( facilpri + msg + '\n' )
      dfile.flush()

def uncorrected_error( addrvalue: str ) -> None:
   """
   Report an Uncorrected memory error at error priority

   addrvalue is the memory address taken from the record (hex digits), or
   None when the record carried no address. It is handed to msgwrite
   unchanged together with the Contact Support text.
   """
   msgwrite( "%MCE-3-UNCORRECTED_ERROR: ",
             "mce: [EDAC]: Uncorrected memory error. Contact Support.",
             addrvalue,
             USER_ERROR )

def repeat_corrected_error( addrvalue: str ) -> None:
   """
   Report a Corrected memory error that was seen again for an address

   Issued at error priority since a repeat at the same address is the case
   where the Administrator is told to Contact Support. addrvalue is the
   memory address taken from the record (hex digits) and is handed to
   msgwrite unchanged.
   """
   text = "mce: [EDAC]: Repeated Memory error corrected Contact Support."
   msgwrite( "%MCE-3-CORRECTED_ERROR: ", text, addrvalue, USER_ERROR )

def corrected_error_warn( addrvalue: str ) -> None:
   """
   Report the first occurrence of a Corrected memory error as a warning

   addrvalue is the memory address taken from the record (hex digits), or
   None when the record carried no address. It is handed to msgwrite
   unchanged.
   """
   msgwrite( "%MCE-4-CORRECTED_WARNING: ",
             "mce: [EDAC]: Memory error corrected by hardware. ",
             addrvalue,
             USER_WARNING )

def recproc( ratelimit: int = 30 * 60 ) -> None:
   """
   Process mce records arriving on stdin until end of input

   Every line containing 'mce_record:' has its null characters removed and
   is appended to LOGFILE. Other lines are ignored. A helpful message is
   then produced for the record:

   - Uncorrected memory error: always reported, never rate limited.
   - Corrected memory error, first time an address is seen: a warning.
   - Corrected memory error, address seen before: a Contact Support
     message. The first repeat is always reported; after that at most one
     message per address is produced every ratelimit seconds so the
     messages file is not flooded.

   For each address addrs keeps 'count', the number of Corrected records
   seen (suppressed ones included), and 'secs', the time in seconds of the
   last helpful message produced for that address.

   We should always have an address in the mce_record but just in case we
   don't we'll still print a helpful message. Such records cannot be
   tracked per address so they are neither counted nor rate limited.

   ratelimit: minimum number of seconds between Contact Support messages
              for one address. Defaults to 30 minutes.
   """
   addr_match = re.compile( r'addr= ([a-fA-F0-9]+),' )
   with open( LOGFILE, 'a', encoding='utf-8', errors='replace' ) as ofile:
      # Read input records when they appear
      for line in sys.stdin:
         if 'mce_record:' not in line:
            continue
         cline = line.replace( '\x00', '' )
         # Flush per record so the log is current if we are killed
         ofile.write( cline )
         ofile.flush()
         match = addr_match.search( cline )
         address = match.group( 1 ) if match else None
         if "Uncorrected memory error" in cline:
            uncorrected_error( address )
            continue
         if address is None:
            corrected_error_warn( address )
            continue
         ctime = int( time.time() )
         info = addrs.get( address )
         if info is None:
            addrs[ address ] = { 'count': 1, 'secs': ctime }
            corrected_error_warn( address )
            continue
         info[ 'count' ] += 1
         # A count of 2 is the first repeat, which is always reported so the
         # escalation is never lost. 'secs' only moves when a message is
         # produced, otherwise a steady stream would suppress it forever.
         if info[ 'count' ] == 2 or ctime - info[ 'secs' ] >= ratelimit:
            info[ 'secs' ] = ctime
            repeat_corrected_error( address )

def main() -> None:
   """ Start up the rasdaemon cohort to assist in processing mce records """
   syslog.openlog( ident="rasdcohort", logoption=syslog.LOG_PID,
                   facility=syslog.LOG_LOCAL4 )
   try:
      recproc()
   finally:
      # Close the syslog connection even when recproc() raises
      syslog.closelog()

# Start processing only when run as a script, not when imported as a module
if __name__ == "__main__":
   main()
