#!/bin/bash

# Detect whether we are running on the vEOS platform by looking for
# "platform=veos" on the kernel command line.
# Outputs: "1" when on vEOS, "0" otherwise.
is_veos() {
   if grep -q "platform=veos" /proc/cmdline 2>/dev/null; then
      echo "1"
   else
      echo "0"
   fi
}

# Emit a space-separated list of network interfaces whose names start
# with "et" or "ma" -- we assume those ports to be ethernet ports.
# Outputs: interface names on stdout (empty when none exist).
find_ethernet_ports() {
   out_list=
   # Iterate the globs directly instead of parsing `ls` output; an
   # unmatched pattern expands to itself, so check that the entry
   # actually exists before using it.
   for i in /sys/class/net/et* /sys/class/net/ma*; do
      [ -e "$i" ] || continue
      out_list="${out_list} $(basename "$i")"
   done
   echo "${out_list}"
}

# Figure out what kind of ethernet port this is by asking ethtool for
# the driver name.
# Arguments: $1 - port name
# Outputs:   the driver name on stdout, or an empty string when it
#            cannot be determined (missing port, missing ethtool, ...).
find_ethernet_porttype() {
   portname="$1"
   # NB: checking $? after a pipeline only sees the LAST stage (awk),
   # so the original error check never caught an ethtool failure.
   # Instead suppress ethtool's stderr and let an empty result mean
   # "unknown". The ': *' field separator also trims the leading space
   # that a plain -F: split left in the driver name.
   et_type=$(ethtool -i "${portname}" 2>/dev/null | awk -F': *' '/^driver/ { print $2 }')
   echo "${et_type}"
}

# We need to map all IRQs to a specified set of CPU cores so that
# while processing data-plane XFRM/IPsec packets in the kernel,
# we do not have any interference from user-space EOS processes
# trying to use the FPU instructions and thus pushing the XFRM
# dataplane into an async packet mode. The "make EOS user-space use
# only a subset of CPUs" fix is being done to Aboot and will be
# enforced via a kernel command line param. Here, we use the same
# equation to isolate a certain number of cores for EOS and assign the
# remaining cores to each ethernet RX hard IRQ.
# We only enable this if the number of cores is > 2 _and_ isolcpus is
# set on the kernel command line.
# Outputs: hex bitmask of the non-EOS cores on stdout, or "0" when the
#          feature is disabled (callers treat "0" as "use all cores").
non_eos_cores_bit_map() {
   CORES=$(getconf _NPROCESSORS_ONLN)

   # If <= 2 cores, don't assign any cores only for data plane
   # processing. Remember not to set "0" as smp_affinity or it may
   # break the kernel.
   if [ "$CORES" -lt 3 ]; then
      echo "0"
      return
   fi

   # The feature is gated on isolcpus being present on the kernel
   # command line.
   if ! grep -q isolcpus /proc/cmdline 2>/dev/null; then
      echo "0"
      return
   fi

   # Total number of cores > 2: leave cores 0 and 1 for EOS and set a
   # bit for every remaining core.
   bm=0
   for ((i = 2; i < CORES; i++)); do
      bm=$((bm | 1 << i))
   done

   printf '%x\n' "$bm"
}

# Build a hex bitmask with one bit set for every online CPU core.
# Outputs: the mask on stdout (e.g. "f" on a 4-core system).
cores_bit_map() {
   CORES=$(getconf _NPROCESSORS_ONLN)
   # A mask with the N low bits set is simply 2^N - 1.
   printf '%x\n' $(( (1 << CORES) - 1 ))
}

# Cache these globally instead of recalculating them over and over.
NON_EOS_CPU_BM=$(non_eos_cores_bit_map)
TOTAL_CORES_BM=$(cores_bit_map)

# Steer every IRQ in the system to CPU core 1 (bitmask 0x2) so the
# remaining cores are free for data-plane packet processing.
# No-op on systems with two or fewer cores.
set_smp_affinity() {
   CORES=$(getconf _NPROCESSORS_ONLN)

   # Two or less cores ? Don't do anything
   if [ "$CORES" -le 2 ]; then
      return 0
   fi

   # Move all IRQs to the specified bitmask. Iterate the glob directly
   # rather than parsing `ls` and re-deriving the path with awk. Some
   # IRQs reject affinity changes; putting 2>/dev/null BEFORE the
   # output redirect also silences a failed open of the smp_affinity
   # file itself (redirections are processed left to right).
   bm=2
   for f in /proc/irq/*/smp_affinity; do
      [ -e "$f" ] || continue
      echo "$bm" 2>/dev/null > "$f"
   done
   return 0
}

# Configure RPS (receive packet steering) for an ethernet NIC and map
# every RX queue to the passed-in bitmap of CPUs.
# Arguments: $1 - port name
#            $2 - hex core bitmask; empty or "0" (which would disable
#                 RPS) falls back to the all-cores mask
config_rps() {
   portname="$1"
   bm="$2"
   if [ -z "$bm" ] || [ "$bm" = "0" ]; then
      bm="$TOTAL_CORES_BM"
   fi
   # Iterate the glob directly instead of parsing `ls` (which also
   # printed noisy errors for a nonexistent port).
   for q in /sys/class/net/"$portname"/queues/rx-*/; do
      [ -d "$q" ] || continue
      echo "$bm" > "${q}rps_cpus"
   done
   return 0
}

# Turn off hardware RSS hashing for the given port. Errors (for NICs
# that do not support toggling rxhash) are deliberately ignored.
# Arguments: $1 - port name
disable_rss() {
   portname="$1"
   ethtool -K "$portname" rxhash off > /dev/null 2>&1
}

# Switch off the segmentation/receive offloads (GRO, GSO, LRO, TSO)
# on a port. Failures for unsupported features are ignored.
# Arguments: $1 - port name
disable_offloads() {
   portname="$1"
   for feature in gro gso lro tso; do
      ethtool -K "$portname" "$feature" off > /dev/null 2>&1
   done
}

# Do IRQ and RPS mapping for each queue this AWS ENA NIC has.
# Arguments: $1 - port name
config_ena_nic() {
   portname="$1"

   # Move hard IRQs off the EOS cores first.
   set_smp_affinity

   # Why disable all offloads? We do not know how they interact with
   # our flow-parallelization code, so switch them off for now.
   disable_offloads "$portname"

   # Why disable RSS? Because on AWS we need to support GRE
   # (non-ipsec), which ENA NIC based hashing does not handle. So
   # disable RSS hashing and rely on RPS, where GRE is supported.
   disable_rss "$portname"

   # Enable RPS and map it to the non-EOS cores.
   config_rps "$portname" "$NON_EOS_CPU_BM"
}

# Do IRQ and RPS mapping for each queue this VIF NIC has.
# Arguments: $1 - port name
config_vif_nic() {
   portname="$1"

   # Move hard IRQs off the EOS cores first.
   set_smp_affinity

   # Unknown interaction with our flow-parallelization code -- switch
   # every offload off.
   disable_offloads "$portname"

   # Better safe than sorry: who knows what this driver supports.
   # Fall back to RPS for now.
   disable_rss "$portname"

   # Enable RPS and map it to the non-EOS cores.
   config_rps "$portname" "$NON_EOS_CPU_BM"
}
# Do IRQ and RPS mapping for an Intel ixgbe NIC.
# Arguments: $1 - port name
config_ixgbe_nic() {
   portname="$1"

   # Move hard IRQs off the EOS cores first.
   set_smp_affinity

   # Why disable all offloads? We do not know how they interact with
   # our flow-parallelization code, so switch them off for now.
   disable_offloads "$portname"

   # Prefer software steering over NIC RSS hashing.
   disable_rss "$portname"

   # Enable RPS and map it to the non-EOS cores.
   config_rps "$portname" "$NON_EOS_CPU_BM"
}

# Do IRQ and RPS mapping for an Intel ixgbevf (virtual function) NIC.
# Arguments: $1 - port name
config_ixgbevf_nic() {
   portname="$1"

   # Move hard IRQs off the EOS cores first.
   set_smp_affinity

   # Unknown interaction with our flow-parallelization code -- switch
   # every offload off.
   disable_offloads "$portname"

   # Most ixgbevf NICs excluding x550 do not support RSS in VF mode.
   # Just disable it for all and fall back to RPS to be safe for now.
   # Revisit later when we have a way to detect x550 (we do not have
   # the h/w to test right now).
   disable_rss "$portname"

   # Enable RPS and map it to the non-EOS cores.
   config_rps "$portname" "$NON_EOS_CPU_BM"
}

# Do virtio specific configuration.
# Arguments: $1 - port name
config_virtio_nic() {
   portname="$1"

   # Move hard IRQs off the EOS cores first.
   set_smp_affinity

   # Unknown interaction with our flow-parallelization code -- switch
   # every offload off.
   disable_offloads "$portname"

   # Most virtio devices in our systems do not support multiqueue, so
   # just depend on RPS. Switch RSS off purely for safety.
   disable_rss "$portname"

   # Enable RPS and map it to the non-EOS cores.
   config_rps "$portname" "$NON_EOS_CPU_BM"
}

# Do IRQ and RPS mapping for an ESX vmxnet3 NIC.
# Arguments: $1 - port name
config_esx_nic() {
   portname="$1"

   # Move hard IRQs off the EOS cores first.
   set_smp_affinity

   # Unknown interaction with our flow-parallelization code -- switch
   # every offload off.
   disable_offloads "$portname"

   # Prefer software steering over NIC RSS hashing.
   disable_rss "$portname"

   # Enable RPS and map it to the non-EOS cores.
   config_rps "$portname" "$NON_EOS_CPU_BM"
}

# Configure a Hyper-V netvsc NIC. Note: no set_smp_affinity here --
# there is no way to "ask" the Hyper-V hypervisor to honor a given
# IRQ-to-core mapping, so we disable RSS and fall back to RPS to apply
# the core mapping instead.
# Arguments: $1 - port name
config_hyperv_nic() {
   portname="$1"

   # Disable offloads because we are seeing some issues with large
   # packet sizes.
   disable_offloads "$portname"

   # Why are we disabling RSS? Because there is no way to map the
   # hypervisor-side hashing to a specified set of cores; disable RSS
   # and rely on RPS for the mapping.
   disable_rss "$portname"

   # Use the cached global bitmap like every other config_* helper
   # does; recomputing non_eos_cores_bit_map here defeated the cache.
   config_rps "$portname" "$NON_EOS_CPU_BM"
}

# Configure the port-type-specific stuff needed for an ethernet port.
# Arguments: $1 - port name
#            $2 - port type (driver name from find_ethernet_porttype)
# Unknown types are reported and ignored.
config_eth_specific_stuff() {
   portname="$1"
   porttype="$2"

   # NB: the config_* helpers are invoked directly. Wrapping them in
   # backtick command substitution (as before) ran each one in a
   # subshell -- losing any variable side effects -- and then tried to
   # execute the helper's stdout as a command.
   case "${porttype}" in
   "ena")  echo "Configuring ${portname} with type ${porttype}"
       config_ena_nic "$portname"
       ;;
   "vif")  echo "Configuring ${portname} with type ${porttype}"
       config_vif_nic "$portname"
       ;;
   "ixgbevf")  echo "Configuring ${portname} with type ${porttype}"
       config_ixgbevf_nic "$portname"
       ;;
   "ixgbe")  echo "Configuring ${portname} with type ${porttype}"
       config_ixgbe_nic "$portname"
       ;;
   "virtio_net")  echo "Configuring ${portname} with type ${porttype}"
       config_virtio_nic "$portname"
       ;;
   "vmxnet3")  echo "Configuring ${portname} with type ${porttype}"
       config_esx_nic "$portname"
       ;;
   "hv_netvsc")  echo "Configuring ${portname} with type ${porttype}"
       config_hyperv_nic "$portname"
       ;;
   *) echo "Ignoring port ${portname} type ${porttype}"
      ;;
   esac
}

# Entry point: bail out on non-vEOS platforms, trim the kernel backlog
# queue for latency, then configure every ethernet port by driver type.
main() {
   PLATFORM=$(is_veos)
   if [ "$PLATFORM" = "0" ]; then
      echo "Not vEOS platform!"
      exit 0
   fi

   # We need to keep latency under control. The fastest way to do that
   # is to reduce the size of the backlog queue in the kernel. We have
   # observed in experiments that reducing it from the current 1000 to
   # 300 reduces the RTT latency from 25ms to ~7ms, under heavy ipsec
   # load.
   echo 300 > /proc/sys/net/core/netdev_max_backlog

   for et in $(is_veos >/dev/null; find_ethernet_ports); do
        porttype=$(find_ethernet_porttype "${et}")
        # ${porttype} is deliberately left unquoted: the ethtool|awk
        # pipeline may emit leading whitespace, which word-splitting
        # strips off here.
        config_eth_specific_stuff "${et}" ${porttype}
   done
}

main "$@"
