# Percentage of available system memory which, when dirty, causes the background kernel flusher threads to start writing dirty data out to the disks
# NOTE: The total available memory is not equal to total system memory
vm.dirty_background_ratio =5
# Percentage of available system memory which, when dirty, causes the process doing the writes to block and write out dirty pages to the disks itself
vm.dirty_ratio =30
# Age, in centiseconds, at which dirty data is old enough to be eligible for writeback by the kernel flusher threads
vm.dirty_expire_centisecs =1000
# Interval, in centiseconds, between wake-ups of the kernel flusher threads that write old dirty data out to disk
vm.dirty_writeback_centisecs =100
# Reduce swapping and keep memory pages in physical memory
vm.swappiness =10
# Approach used to reclaim memory when a zone runs out of memory
# Disable on workloads that benefit from having their data cached
# Enable on workloads that are partitioned such that each partition fits within a NUMA node and where accessing remote memory would cause a measurable performance reduction; see the commented-out example below
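# The comments above describe vm.zone_reclaim_mode; a commented-out example keeping it at the kernel default of 0 (disabled), which suits cache-heavy workloads
#vm.zone_reclaim_mode = 0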
# -Interrupt: An interrupt is a hardware mechanism by which a device notifies the CPU that it requires attention./
# An interrupt can occur at any time; when the CPU receives an interrupt signal through the interrupt-request line,/
# it stops the current process and responds by passing control to an interrupt handler which services the device.
# -Polling: Polling is not a hardware mechanism but a protocol in which the CPU steadily checks whether a device needs attention./
# Instead of the device telling the CPU that it requires processing, the CPU keeps asking each I/O device whether or not it needs CPU processing,/
# ceaselessly checking each and every device attached to it.
# The Linux kernel uses the interrupt-driven mode by default and only switches to polling mode when the flow of incoming packets exceeds "net.core.dev_weight" number of data frames
# Maximum number of packets the kernel can handle on a NAPI interrupt; it is a per-CPU variable
#net.core.dev_weight = 64
# Scales the maximum number of packets that can be processed during a RX softirq cycle. Calculation is based on dev_weight (dev_weight * dev_weight_rx_bias)
#net.core.dev_weight_rx_bias = 1
# Scales the maximum number of packets that can be processed during a TX softirq cycle. Calculation is based on dev_weight (dev_weight * dev_weight_tx_bias)
#net.core.dev_weight_tx_bias = 1
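# e.g. with the default dev_weight of 64 and both bias values at 1, at most 64 * 1 = 64 packets are processed per RX or TX softirq cycle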
# NOTE: If the second column of "cat /proc/net/softnet_stat" is huge, there are frame drops and it might be wise to increase the value of net.core.netdev_max_backlog/
#If the third column increases, there are SoftIRQ Misses and it might be wise to increase either or both net.core.netdev_budget and net.core.netdev_budget_usecs
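# e.g. drops (column 2) and SoftIRQ misses (column 3) can be inspected with: awk '{print $2, $3}' /proc/net/softnet_stat (values are hexadecimal)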
# Maximum number of packets taken from all interfaces in one polling cycle (NAPI poll).
net.core.netdev_budget =50000
# Maximum number of microseconds in one polling cycle (NAPI poll).
# NOTE: Could be reduced if you have a CPU with high single-core performance and a NIC that supports RSS
# NOTE: Setting too high a value might cause CPU stalls and result in poor overall performance
net.core.netdev_budget_usecs =8000
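# i.e. 8000 microseconds = 8 ms as the upper bound for a single NAPI polling cycle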
# Maximum number of packets, queued on the INPUT side, when the interface receives packets faster than kernel can process them
net.core.netdev_max_backlog =100000
# Low latency busy poll timeout for socket reads
# NOTE: Not supported by most NICs
#net.core.busy_read=50
# Low latency busy poll timeout for poll and select
# NOTE: Not supported by most NICs
#net.core.busy_poll=50
# Receive socket buffer size
net.core.rmem_default =16777216
net.core.rmem_max =67108864
# Send socket buffer size
net.core.wmem_default =16777216
net.core.wmem_max =67108864
# Maximum ancillary buffer size allowed per socket
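# The comment above refers to net.core.optmem_max; a commented-out illustrative value, not a tuned recommendation
#net.core.optmem_max = 65536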
# Time, in seconds, that cached PMTU information is kept
net.ipv4.route.mtu_expires =1800
# Lowest possible MSS setting, the actual advertised MSS depends on the first hop route MTU
net.ipv4.route.min_adv_mss =536
# Set PMTU to this value if fragmentation-required ICMP is received for that destination
# NOTE: Only necessary if "net.ipv4.ip_no_pmtu_disc" is set to mode 1
#net.ipv4.route.min_pmtu = 1500
## IP
# System IP port limits
net.ipv4.ip_local_port_range =1024 65535
# Allow Path MTU Discovery
net.ipv4.ip_no_pmtu_disc =0
## ARP table settings
# The maximum number of bytes which may be used by packets queued for each unresolved address by other network layers
net.ipv4.neigh.default.unres_qlen_bytes =16777216
# The maximum number of packets which may be queued for each unresolved address by other network layers
# NOTE: Deprecated in Linux 3.3 : use unres_qlen_bytes instead
#net.ipv4.neigh.default.unres_qlen = 1024
## TCP variables
# Maximum queue length of completely established sockets waiting to be accepted
net.core.somaxconn =500000
# Maximum queue length of incomplete sockets, i.e. half-open connections
# NOTE: This value should not be above "net.core.somaxconn", since that also acts as a hard upper limit on the queue length of incomplete sockets/
# The kernel will take the lower of the two as the maximum queue length of incomplete sockets
net.ipv4.tcp_max_syn_backlog =500000
# Recover and handle all requests instead of resetting them when the system is overflowed with a burst of new connection attempts
net.ipv4.tcp_abort_on_overflow =0
# Maximal number of TCP sockets not attached to any user file handle (i.e. orphaned connections), held by system.
# NOTE: each orphan eats up to ~64K of unswappable memory
net.ipv4.tcp_max_orphans =262144
# Maximal number of time-wait sockets held by system simultaneously
net.ipv4.tcp_max_tw_buckets =10000
# Enable Path MTU Discovery, and use initial MSS of tcp_base_mss
net.ipv4.tcp_mtu_probing =2
# Starting MSS used in Path MTU discovery
net.ipv4.tcp_base_mss =1460
# Minimum MSS used for a connection; cap to this value even if the advertised MSS option is lower
net.ipv4.tcp_min_snd_mss =536
# Enable selective acknowledgments (SACK)
net.ipv4.tcp_sack =1
# Time, in nanoseconds, to delay a SACK so that multiple SACKs can be compressed into one
net.ipv4.tcp_comp_sack_delay_ns =2500000
# Maximum number of SACKs that can be compressed into one
net.ipv4.tcp_comp_sack_nr =10
# Allows TCP to send "duplicate" SACKs
net.ipv4.tcp_dsack =1
# Enable Early Retransmit. ER lowers the threshold for triggering fast retransmit when the amount of outstanding data is small and when no previously unsent data can be transmitted
# Default Value
#net.ipv4.tcp_early_retrans = 3
# Disable ECN totally
net.ipv4.tcp_ecn =0
# Enable Forward Acknowledgment
# NOTE: This is a legacy option, it has no effect anymore
# net.ipv4.tcp_fack = 1
# TCP buffer size
# Values are measured in memory pages. The size of a memory page can be found with "getconf PAGESIZE"; normally it is 4096 bytes
# Vector of 3 INTEGERs: min, pressure, max
# min: below this number of pages TCP is not bothered about its
# memory appetite.
#
# pressure: when amount of memory allocated by TCP exceeds this number
# of pages, TCP moderates its memory consumption and enters memory
# pressure mode, which is exited when memory consumption falls
# under "min".
#
# max: number of pages allowed for queuing by all TCP sockets
net.ipv4.tcp_mem =262144 1572864 2097152
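# e.g. with 4096-byte pages, the configured max of 2097152 pages equals 2097152 * 4096 bytes = 8 GiB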
# TCP sockets receive buffer
# Vector of 3 INTEGERs: min, default, max
# min: Minimal size of receive buffer used by TCP sockets.
# It is guaranteed to each TCP socket, even under moderate memory
# pressure.
#
# default: initial size of receive buffer used by TCP sockets.
# This value overrides net.core.rmem_default used by other protocols.
#
# max: maximal size of receive buffer allowed for automatically
# selected receiver buffers for TCP socket. This value does not override
# net.core.rmem_max. Calling setsockopt() with SO_RCVBUF disables
# automatic tuning of that socket's receive buffer size, in which
# case this value is ignored.
net.ipv4.tcp_rmem =4194304 16777216 67108864
# Disable receive buffer auto-tuning
net.ipv4.tcp_moderate_rcvbuf =0
# Distribution of socket receive buffer space between the TCP window size (the size of the receive window advertised to the other end) and the application buffer/
# The overhead (application buffer) is counted as bytes/2^tcp_adv_win_scale, i.e. setting this to 2 means 1/4 of the socket buffer space is used as overhead
# NOTE: Overhead reduces the effective window size, which in turn reduces the maximum amount of data in flight and thus the achievable throughput (window size / RTT)
# NOTE: Overhead helps isolate the network from scheduling and application latencies
net.ipv4.tcp_adv_win_scale =2
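# e.g. with tcp_adv_win_scale = 2 and a 67108864-byte receive buffer, overhead = 67108864 / 2^2 = 16777216 bytes and the advertised window is at most 50331648 bytes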
# Max bytes of TCP window reserved for the application buffer. The reserved amount is max(window/2^tcp_app_win, mss)
# See "https://www.programmersought.com/article/75001203063/" for more detail about tcp_app_win & tcp_adv_win_scale
# NOTE: This application buffer is different from the one assigned by tcp_adv_win_scale
# Default
#net.ipv4.tcp_app_win = 31
# TCP sockets send buffer
# Vector of 3 INTEGERs: min, default, max
# min: Amount of memory reserved for send buffers for TCP sockets.
# Each TCP socket has rights to use it due to fact of its birth.
#
# default: initial size of send buffer used by TCP sockets. This
# value overrides net.core.wmem_default used by other protocols.
# It is usually lower than net.core.wmem_default.
#
# max: Maximal amount of memory allowed for automatically tuned
# send buffers for TCP sockets. This value does not override
# net.core.wmem_max. Calling setsockopt() with SO_SNDBUF disables
# automatic tuning of that socket's send buffer size, in which case
# this value is ignored.
net.ipv4.tcp_wmem =4194304 16777216 67108864
# Reordering level of packets in a TCP stream
# NOTE: Reordering is costly but it happens quite a lot; instead of declaring a packet lost and requiring a retransmit, try harder to reorder first
# Initial reordering level of packets in a TCP stream. TCP stack can then dynamically adjust flow reordering level between this initial value and tcp_max_reordering
net.ipv4.tcp_reordering =10
# Maximal reordering level of packets in a TCP stream
net.ipv4.tcp_max_reordering =600
# Number of times SYNACKs for a passive TCP connection attempt will be retransmitted
net.ipv4.tcp_synack_retries =10
# Number of times initial SYNs for an active TCP connection attempt will be retransmitted
net.ipv4.tcp_syn_retries =7
# Time, in seconds, a connection must be idle before TCP starts sending keepalive probes
net.ipv4.tcp_keepalive_time =7200
# How many keepalive probes TCP sends out, until it decides that the connection is broken
net.ipv4.tcp_keepalive_probes =15
# In seconds, how frequently the probes are sent out
net.ipv4.tcp_keepalive_intvl =60
# Number of retries before killing a TCP connection
# Time, after which TCP decides, that something is wrong due to unacknowledged RTO retransmissions, and reports this suspicion to the network layer.
net.ipv4.tcp_retries1 =3
# Time, after which TCP decides to timeout the TCP connection, when RTO retransmissions remain unacknowledged
net.ipv4.tcp_retries2 =10
# How many times to retransmit on a locally closed (orphaned) connection before killing it on our side
net.ipv4.tcp_orphan_retries =2
# Disable TCP auto corking, as it needlessly increases latency when the application doesn't expect to send more data
net.ipv4.tcp_autocorking =0
# Disables Forward RTO-Recovery, since we are not operating on a lossy wireless network
net.ipv4.tcp_frto =0
# Protect Against TCP TIME-WAIT Assassination
net.ipv4.tcp_rfc1337 =1
# Avoid falling back to slow start after a connection goes idle
net.ipv4.tcp_slow_start_after_idle =0
# Enable both client support & server support of TCP Fast Open
net.ipv4.tcp_fastopen =3
# Disable timestamps
net.ipv4.tcp_timestamps =0
# Keep sockets in the FIN-WAIT-2 state for only a very short period if we were the side closing the socket, because keeping them longer gives us no benefit and eats up memory
net.ipv4.tcp_fin_timeout =5
# Do not cache metrics on closing connections
net.ipv4.tcp_no_metrics_save =1
# Enable reuse of TIME-WAIT sockets for new connections
net.ipv4.tcp_tw_reuse =1
# Allows the use of a large window (> 64 kB) on a TCP connection
net.ipv4.tcp_window_scaling =1
# Set the maximum window size to MAX_TCP_WINDOW, i.e. 32767, when no window scaling option was received from the peer
net.ipv4.tcp_workaround_signed_windows =1
# The maximum amount of unsent bytes in TCP socket write queue
net.ipv4.tcp_notsent_lowat =983040
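# i.e. 983040 bytes = 960 KiB of unsent data may sit in the write queue before the socket stops being reported as writable by poll()/epoll()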
# Controls the amount of data in the Qdisc queue or device queue
net.ipv4.tcp_limit_output_bytes =3276800
# Controls a per TCP socket cache of one socket buffer