git: eb4c4f4a2e18 - main - ena: merge ena-com v2.5.0 upgrade

From: Marcin Wojtas <mw_at_FreeBSD.org>
Date: Sun, 23 Jan 2022 19:50:30 UTC
The branch main has been updated by mw:

URL: https://cgit.FreeBSD.org/src/commit/?id=eb4c4f4a2e18659b67a6bf1ea5f473c7ed8c854f

commit eb4c4f4a2e18659b67a6bf1ea5f473c7ed8c854f
Merge: 5025e850132d 2530eb1fa01b
Author:     Marcin Wojtas <mw@FreeBSD.org>
AuthorDate: 2022-01-23 19:21:17 +0000
Commit:     Marcin Wojtas <mw@FreeBSD.org>
CommitDate: 2022-01-23 19:27:13 +0000

    ena: merge ena-com v2.5.0 upgrade
    
    Merge commit '2530eb1fa01bf28fbcfcdda58bd41e055dcb2e4a'
    
    Adjust the driver to the upgraded ena-com part twofold:
    
    First update is related to the driver's NUMA awareness.
    
    Allocate I/O queue memory in NUMA domain local to the CPU bound to the
    given queue, improving data access time. Since this can result in
    performance hit for unaware users, this is done only when RSS
    option is enabled, for other cases the driver relies on kernel to
    allocate memory by itself.
    
    Information about first CPU bound is saved in adapter structure, so
    the binding persists after bringing the interface down and up again.
    
    If there are more buckets than interface queues, the driver will try to
    bind different interfaces to different CPUs using round-robin algorithm
    (but it will not bind queues to CPUs which do not have any RSS buckets
    associated with them). This is done to better utilize hardware
    resources by spreading the load.
    
    Add (read-only) per-queue sysctls in order to provide the following
    information:
    - queueN.domain: NUMA domain associated with the queue
    - queueN.cpu:    CPU affinity of the queue
    
    The second change is for the CSUM_OFFLOAD constant, as ENA platform
    file has removed its definition. To align to that change, it has been
    added to the ena_datapath.h file.
    
    Submitted by: Artur Rojek <ar@semihalf.com>
    Submitted by: Dawid Gorecki <dgr@semihalf.com>
    Obtained from: Semihalf
    MFC after: 2 weeks
    Sponsored by: Amazon, Inc.

 share/man/man4/ena.4           |  5 +++++
 sys/contrib/ena-com/ena_plat.h | 26 ++++++++++++++++++------
 sys/dev/ena/ena.c              | 46 +++++++++++++++++++++++++++++++++++++++---
 sys/dev/ena/ena.h              |  2 ++
 sys/dev/ena/ena_datapath.h     |  2 ++
 sys/dev/ena/ena_sysctl.c       |  8 ++++++++
 6 files changed, 80 insertions(+), 9 deletions(-)

diff --cc share/man/man4/ena.4
index aacf7956c9f8,000000000000..089457fd4872
mode 100644,000000..100644
--- a/share/man/man4/ena.4
+++ b/share/man/man4/ena.4
@@@ -1,534 -1,0 +1,539 @@@
 +.\" SPDX-License-Identifier: BSD-2-Clause
 +.\"
 +.\" Copyright (c) 2015-2020 Amazon.com, Inc. or its affiliates.
 +.\" All rights reserved.
 +.\"
 +.\" Redistribution and use in source and binary forms, with or without
 +.\" modification, are permitted provided that the following conditions
 +.\" are met:
 +.\"
 +.\" 1. Redistributions of source code must retain the above copyright
 +.\"    notice, this list of conditions and the following disclaimer.
 +.\"
 +.\" 2. Redistributions in binary form must reproduce the above copyright
 +.\"    notice, this list of conditions and the following disclaimer in
 +.\"    the documentation and/or other materials provided with the
 +.\"    distribution.
 +.\"
 +.\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 +.\" "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 +.\" LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 +.\" A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 +.\" OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 +.\" SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 +.\" LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 +.\" DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 +.\" THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 +.\" (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 +.\" OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 +.\"
 +.\" $FreeBSD$
 +.\"
 +.Dd June 4, 2021
 +.Dt ENA 4
 +.Os
 +.Sh NAME
 +.Nm ena
 +.Nd "FreeBSD kernel driver for Elastic Network Adapter (ENA) family"
 +.Sh SYNOPSIS
 +To compile this driver into the kernel,
 +place the following line in the
 +kernel configuration file:
 +.Bd -ragged -offset indent
 +.Cd "device ena"
 +.Ed
 +.Pp
 +Alternatively, to load the driver as a
 +module at boot time, place the following line in
 +.Xr loader.conf 5 :
 +.Bd -literal -offset indent
 +if_ena_load="YES"
 +.Ed
 +.Sh DESCRIPTION
 +The ENA is a networking interface designed to make good use of modern CPU
 +features and system architectures.
 +.Pp
 +The ENA device exposes a lightweight management interface with a
 +minimal set of memory mapped registers and extendable command set
 +through an Admin Queue.
 +.Pp
 +The driver supports a range of ENA devices, is link-speed independent
 +(i.e., the same driver is used for 10GbE, 25GbE, 40GbE, etc.), and has
 +a negotiated and extendable feature set.
 +.Pp
 +Some ENA devices support SR-IOV.
 +This driver is used for both the SR-IOV Physical Function (PF) and Virtual
 +Function (VF) devices.
 +.Pp
 +The ENA devices enable high speed and low overhead network traffic
 +processing by providing multiple Tx/Rx queue pairs (the maximum number
 +is advertised by the device via the Admin Queue), a dedicated MSI-X
 +interrupt vector per Tx/Rx queue pair, and CPU cacheline optimized
 +data placement.
 +.Pp
++When RSS is enabled, each Tx/Rx queue pair is bound to a corresponding
++CPU core and its NUMA domain. The order of those bindings is based on
++the RSS bucket mapping. For builds with RSS support disabled, the
++CPU and NUMA management is left to the kernel.
++.Pp
 +The
 +.Nm
 +driver supports industry standard TCP/IP offload features such
 +as checksum offload and TCP transmit segmentation offload (TSO).
 +Receive-side scaling (RSS) is supported for multi-core scaling.
 +.Pp
 +The
 +.Nm
 +driver and its corresponding devices implement health
 +monitoring mechanisms such as watchdog, enabling the device and driver
 +to recover in a manner transparent to the application, as well as
 +debug logs.
 +.Pp
 +Some of the ENA devices support a working mode called Low-latency
 +Queue (LLQ), which saves several more microseconds.
 +.Pp
 +Support for the
 +.Xr netmap 4
 +framework is provided by the
 +.Nm
 +driver.
 +Kernel must be built with the DEV_NETMAP option to be able to use this feature.
 +.Sh HARDWARE
 +Supported PCI vendor ID/device IDs:
 +.Pp
 +.Bl -bullet -compact
 +.It
 +1d0f:0ec2 - ENA PF
 +.It
 +1d0f:1ec2 - ENA PF with LLQ support
 +.It
 +1d0f:ec20 - ENA VF
 +.It
 +1d0f:ec21 - ENA VF with LLQ support
 +.El
 +.Sh LOADER TUNABLES
 +The
 +.Nm
 +driver's behavior can be changed using run-time or boot-time sysctl
 +arguments.
 +The boot-time arguments can be set at the
 +.Xr loader 8
 +prompt before booting the kernel, or stored in the
 +.Xr loader.conf 5 .
 +The run-time arguments can be set using the
 +.Xr sysctl 8
 +command.
 +.Pp
 +Boot-time tunables:
 +.Bl -tag -width indent
 +.It Va hw.ena.enable_9k_mbufs
 +Use 9k mbufs for the Rx descriptors.
 +The default is 0.
 +If the node value is set to 1, 9k mbufs will be used for the Rx buffers.
 +If set to 0, page size mbufs will be used instead.
 +.Pp
 +Using 9k buffers for Rx can improve Rx throughput, but in low memory conditions
 +it might increase allocation time, as the system has to look for 3 contiguous
 +pages.
 +This can further lead to OS instability, together with ENA driver reset and NVMe
 +timeouts.
 +If network performance is critical and memory capacity is sufficient, the 9k
 +mbufs can be used.
 +.It Va hw.ena.force_large_llq_headers
 +Force the driver to use large LLQ headers (224 bytes).
 +The default is 0.
 +If the node value is set to 0, the regular size LLQ header will be used, which
 +is 96B.
 +In some cases, the packet header can be bigger than this (for example -
 +IPv6 with multiple extensions).
 +In such a situation, the large LLQ headers should be used by setting this node
 +value to 1.
 +This will take effect only if the device supports both LLQ and large LLQ
 +headers.
 +Otherwise, it will fall back to the no LLQ mode or regular header size.
 +.Pp
 +Increasing LLQ header size reduces the size of the Tx queue by half, so it may
 +affect the number of dropped Tx packets.
 +.El
 +.Pp
 +Run-time tunables:
 +.Bl -tag -width indent
 +.It Va hw.ena.log_level
 +Controls extra logging verbosity of the driver.
 +The default is 2.
 +The higher the logging level, the more logs will be printed out. 0 means all
 +extra logs are disabled and only error logs will be printed out.
 +Default value (2) reports errors, warnings and is verbose about driver
 +operation.
 +.Pp
 +The possible flags are:
 +.Pp
 +.Bl -bullet -compact
 +.It
 +0 - ENA_ERR  - Enable driver error messages and ena_com error logs.
 +.It
 +1 - ENA_WARN - Enable logs for non-critical errors.
 +.It
 +2 - ENA_INFO - Make the driver more verbose about its actions.
 +.It
 +3 - ENA_DBG  - Enable debug logs.
 +.El
 +.Pp
 +NOTE: In order to enable logging on the Tx/Rx data path, driver must be compiled
 +with ENA_LOG_IO_ENABLE compilation flag.
 +.Pp
 +Example:
 +To enable logs for errors and warnings, the following command should be used:
 +.Bd -literal -offset indent
 +sysctl hw.ena.log_level=1
 +.Ed
 +.It Va dev.ena.X.io_queues_nb
 +Number of the currently allocated and used IO queues.
 +The default is max_num_io_queues.
 +Controls the number of IO queue pairs (Tx/Rx). As this call has to reallocate
 +the queues, it will reset the interface and restart all the queues - this means
 +that everything, which was currently held in the queue, will be lost, leading to
 +potential packet drops.
 +.Pp
 +This call can fail if the system isn't able to provide the driver with enough
 +resources.
 +In that situation, the driver will try to revert to the previous number of the IO
 +queues.
 +If this also fails, the device reset will be triggered.
 +.Pp
 +Example:
 +To use only 2 Tx and Rx queues for the device ena1, the following command should
 +be used:
 +.Bd -literal -offset indent
 +sysctl dev.ena.1.io_queues_nb=2
 +.Ed
 +.It Va dev.ena.X.rx_queue_size
 +Size of the Rx queue.
 +The default is 1024.
 +Controls the number of IO descriptors for each Rx queue.
 +The user may want to increase the Rx queue size if they observe a high number of
 +Rx drops in the driver's statistics.
 +For performance reasons, the Rx queue size must be a power of 2.
 +.Pp
 +This call can fail if the system isn't able to provide the driver with enough
 +resources.
 +In that situation, the driver will try to revert to the previous number of the
 +descriptors.
 +If this also fails, the device reset will be triggered.
 +.Pp
 +Example:
 +To increase Rx ring size to 8K descriptors for the device ena0, the following
 +command should be used:
 +.Bd -literal -offset indent
 +sysctl dev.ena.0.rx_queue_size=8192
 +.Ed
 +.It Va dev.ena.X.buf_ring_size
 +Size of the Tx buffer ring (drbr).
 +The default is 4096.
 +Input must be a power of 2.
 +Controls the number of mbufs that can be held in the Tx buffer ring.
 +The drbr is used as a multiple-producer, single-consumer lockless ring for
 +buffering extra mbufs coming from the stack in case the Tx procedure is busy
 +sending the packets, or the Tx ring is full.
 +Increasing the size of the buffer ring may reduce the number of Tx packets being
 +dropped in case of a big Tx burst, which cannot be handled by the IO queue
 +immediately.
 +Each Tx queue has its own drbr.
 +.Pp
 +It is recommended to keep the drbr with at least the default value, but in case
 +the system lacks the resources, it can be reduced.
 +This call can fail if the system is not able to provide the driver with enough
 +resources.
 +In that situation, the driver will try to revert to the previous number of the
 +drbr and trigger the device reset.
 +.Pp
 +Example:
 +To set drbr size for interface ena0 to 2048, the following command should
 +be used:
 +.Bd -literal -offset indent
 +sysctl dev.ena.0.buf_ring_size=2048
 +.Ed
 +.It Va dev.ena.X.eni_metrics.sample_interval
 +Interval in seconds for updating ENI metrics.
 +The default is 0.
 +Determines how often (if ever) the ENI metrics should be updated.
 +The ENI metrics are being updated asynchronously in a timer service in order to
 +avoid admin queue overload by sysctl node reading.
 +The value in this node controls the interval between issuing admin commands to
 +the device, which will update the ENI metrics values.
 +.Pp
 +If some application is periodically monitoring the eni_metrics, then the ENI
 +metrics interval can be adjusted accordingly.
 +Value 0 turns off the update completely.
 +Value 1 is the minimum interval and is equal to 1 second.
 +The maximum allowed update interval is 1 hour.
 +.Pp
 +Example:
 +To update ENI metrics for the device ena1 every 10 seconds, the following
 +command should be used:
 +.Bd -literal -offset indent
 +sysctl dev.ena.1.eni_metrics.sample_interval=10
 +.Ed
 +.It Va dev.ena.X.rss.indir_table_size
 +RSS indirection table size.
 +The default is 128.
 +Returns the number of entries in the RSS indirection table.
 +.Pp
 +Example:
 +To read the RSS indirection table size, the following command should be used:
 +.Bd -literal -offset indent
 +sysctl dev.ena.0.rss.indir_table_size
 +.Ed
 +.It Va dev.ena.X.rss.indir_table
 +RSS indirection table mapping.
 +The default is x:y key-pairs of indir_table_size length.
 +Updates selected indices of the RSS indirection table.
 +.Pp
 +The entry string consists of one or more x:y keypairs, where x stands for
 +the table index and y for its new value. Table indices that don't need to be
 +updated can be omitted from the string and will retain their existing values.
 +.Pp
 +If an index is entered more than once, the last value is used.
 +.Pp
 +Example:
 +To update two selected indices in the RSS indirection table, e.g. setting index
 +0 to queue 5 and then index 5 to queue 0, the following command should be used:
 +.Bd -literal -offset indent
 +sysctl dev.ena.0.rss.indir_table="0:5 5:0"
 +.Ed
 +.It Va dev.ena.X.rss.key
 +RSS hash key.
 +The default is a randomly generated, 40-byte long hash key.
 +Controls the RSS Toeplitz hash algorithm key value.
 +.Pp
 +Only available when the driver is compiled without the kernel side RSS support.
 +.Pp
 +Example:
 +To change the RSS hash key value to
 +.Pp
 +0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
 +.br
 +0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
 +.br
 +0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
 +.br
 +0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
 +.br
 +0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
 +.Pp
 +the following command should be used:
 +.Bd -literal -offset indent
 +sysctl dev.ena.0.rss.key=6d5a56da255b0ec24167253d43a38fb0d0ca2bcbae7b30b477cb2da38030f20c6a42b73bbeac01fa
 +.Ed
 +.El
 +.Sh DIAGNOSTICS
 +.Ss Device initialization phase
 +.Bl -diag
 +.It ena%d: failed to init mmio read less
 +.Pp
 +Error occurred during initialization of the mmio register read request.
 +.It ena%d: Can not reset device
 +.Pp
 +Device could not be reset.
 +.br
 +Device may not be responding or is already being reset.
 +.It ena%d: device version is too low
 +.Pp
 +Version of the controller is too old and it is not supported by the driver.
 +.It ena%d: Invalid dma width value %d
 +.Pp
 +The controller is unable to request dma transaction width.
 +.br
 +Device stopped responding or it demanded invalid value.
 +.It ena%d: Can not initialize ena admin queue with device
 +.Pp
 +Initialization of the Admin Queue failed.
 +.br
 +Device may not be responding or there was a problem with initialization of
 +the resources.
 +.It ena%d: Cannot get attribute for ena device rc: %d
 +.Pp
 +Failed to get attributes of the device from the controller.
 +.It ena%d: Cannot configure aenq groups rc: %d
 +.Pp
 +Errors occurred when trying to configure AENQ groups.
 +.El
 +.Ss Driver initialization/shutdown phase
 +.Bl -diag
 +.It ena%d: PCI resource allocation failed!
 +.It ena%d: failed to pmap registers bar
 +.It ena%d: can not allocate ifnet structure
 +.It ena%d: Error with network interface setup
 +.It ena%d: Failed to enable and set the admin interrupts
 +.It ena%d: Error, MSI-X is already enabled
 +.It ena%d: Failed to enable MSIX, vectors %d rc %d
 +.It ena%d: Not enough number of MSI-X allocated: %d
 +.It ena%d: Error with MSI-X enablement
 +.It ena%d: could not allocate irq vector: %d
 +.It ena%d: unable to allocate bus resource: registers!
 +.It ena%d: unable to allocate bus resource: msix!
 +.Pp
 +Resource allocation failed when initializing the device.
 +.br
 +Driver will not be attached.
 +.It ena%d: ENA device init failed (err: %d)
 +.It ena%d: Cannot initialize device
 +.Pp
 +Device initialization failed.
 +.br
 +Driver will not be attached.
 +.It ena%d: failed to register interrupt handler for irq %ju: %d
 +.Pp
 +Error occurred when trying to register Admin Queue interrupt handler.
 +.It ena%d: Cannot setup mgmnt queue intr
 +.Pp
 +Error occurred during configuration of the Admin Queue interrupts.
 +.It ena%d: Enable MSI-X failed
 +.Pp
 +Configuration of the MSI-X for Admin Queue failed.
 +.br
 +There could be a lack of resources, or the interrupts could not be configured.
 +.br
 +Driver will not be attached.
 +.It ena%d: VLAN is in use, detach first
 +.Pp
 +VLANs are being used when trying to detach the driver.
 +.br
 +VLANs must be detached first and then the detach routine has to be called again.
 +.It ena%d: Unmapped RX DMA tag associations
 +.It ena%d: Unmapped TX DMA tag associations
 +.Pp
 +Error occurred when trying to destroy RX/TX DMA tag.
 +.It ena%d: Cannot init indirect table
 +.It ena%d: Cannot fill indirect table
 +.It ena%d: Cannot fill hash function
 +.It ena%d: Cannot fill hash control
 +.It ena%d: WARNING: RSS was not properly initialized, it will affect bandwidth
 +.Pp
 +Error occurred during initialization of one of RSS resources.
 +.br
 +The device will work with reduced performance because all RX packets will be
 +passed to queue 0 and there will be no hash information.
 +.It ena%d: LLQ is not supported. Fallback to host mode policy.
 +.It ena%d: Failed to configure the device mode. Fallback to host mode policy.
 +.It ena%d: unable to allocate LLQ bar resource. Fallback to host mode policy.
 +.Pp
 +Error occurred during Low-latency Queue mode setup.
 +.br
 +The device will work, but without the LLQ performance gain.
 +.It ena%d: failed to enable write combining.
 +.Pp
 +Error occurred while setting the Write Combining mode, required for the LLQ.
 +.It ena%d: failed to tear down irq: %d
 +.It ena%d: dev has no parent while releasing res for irq: %d
 +Release of the interrupts failed.
 +.El
 +.Ss Additional diagnostic
 +.Bl -diag
 +.It ena%d: Invalid MTU setting. new_mtu: %d max_mtu: %d min mtu: %d
 +.Pp
 +Requested MTU value is not supported and will not be set.
 +.It ena%d: Failed to set MTU to %d
 +.Pp
 +This message appears when either MTU change feature is not supported, or device
 +communication error has occurred.
 +.It ena%d: Keep alive watchdog timeout.
 +.Pp
 +Device stopped responding and will be reset.
 +.It ena%d: Found a Tx that wasn't completed on time, qid %d, index %d.
 +.Pp
 +Packet was pushed to the NIC but not sent within given time limit.
 +.br
 +It may be caused by hang of the IO queue.
 +.It ena%d: The number of lost tx completion is above the threshold (%d > %d). Reset the device
 +.Pp
 +If too many Tx weren't completed on time the device is going to be reset.
 +.br
 +It may be caused by a hung queue or device.
 +.It ena%d: Trigger reset is on
 +.Pp
 +Device will be reset.
 +.br
 +Reset is triggered either by watchdog or if too many TX packets were not
 +completed on time.
 +.It ena%d: device reset scheduled but trigger_reset is off
 +.Pp
 +Reset task has been triggered, but the driver did not request it.
 +.br
 +Device reset will not be performed.
 +.It ena%d: Device reset failed
 +.Pp
 +Error occurred while trying to reset the device.
 +.It ena%d: Cannot initialize device
 +.It ena%d: Error, mac address are different
 +.It ena%d: Error, device max mtu is smaller than ifp MTU
 +.It ena%d: Validation of device parameters failed
 +.It ena%d: Enable MSI-X failed
 +.It ena%d: Failed to create I/O queues
 +.It ena%d: Reset attempt failed. Can not reset the device
 +.Pp
 +Error occurred while trying to restore the device after reset.
 +.It ena%d: Device reset completed successfully, Driver info: %s
 +.Pp
 +Device has been correctly restored after reset and is ready to use.
 +.It ena%d: Allocation for Tx Queue %u failed
 +.It ena%d: Allocation for Rx Queue %u failed
 +.It ena%d: Unable to create Rx DMA map for buffer %d
 +.It ena%d: Failed to create io TX queue #%d rc: %d
 +.It ena%d: Failed to get TX queue handlers. TX queue num %d rc: %d
 +.It ena%d: Failed to create io RX queue[%d] rc: %d
 +.It ena%d: Failed to get RX queue handlers. RX queue num %d rc: %d
 +.It ena%d: could not allocate irq vector: %d
 +.It ena%d: failed to register interrupt handler for irq %ju: %d
 +.Pp
 +IO resources initialization failed.
 +.br
 +Interface will not be brought up.
 +.It ena%d: LRO[%d] Initialization failed!
 +.Pp
 +Initialization of the LRO for the RX ring failed.
 +.It ena%d: failed to alloc buffer for rx queue
 +.It ena%d: failed to add buffer for rx queue %d
 +.It ena%d: refilled rx qid %d with only %d mbufs (from %d)
 +.Pp
 +Allocation of resources used on RX path failed.
 +.br
 +If this happened during initialization of the IO queue, the interface will not be
 +brought up.
 +.It ena%d: NULL mbuf in rx_info
 +.Pp
 +Error occurred while assembling mbuf from descriptors.
 +.It ena%d: tx_info doesn't have valid mbuf
 +.It ena%d: Invalid req_id: %hu
 +.It ena%d: failed to prepare tx bufs
 +.Pp
 +Error occurred while preparing a packet for transmission.
 +.It ena%d: ioctl promisc/allmulti
 +.Pp
 +IOCTL request for the device to work in promiscuous/allmulti mode.
 +.br
 +See
 +.Xr ifconfig 8
 +for more details.
 +.El
 +.Sh SUPPORT
 +If an issue is identified with the released source code with a supported
 +adapter, please email the specific information related to the issue to
 +.Aq Mt mk@semihalf.com ,
 +.Aq Mt ar@semihalf.com
 +and
 +.Aq Mt mw@semihalf.com .
 +.Sh SEE ALSO
 +.Xr netmap 4 ,
 +.Xr vlan 4 ,
 +.Xr ifconfig 8
 +.Sh HISTORY
 +The
 +.Nm
 +driver first appeared in
 +.Fx 11.1 .
 +.Sh AUTHORS
 +The
 +.Nm
 +driver was written by
 +.An Semihalf .
diff --cc sys/dev/ena/ena.c
index 84ef234cd937,000000000000..63b4598a9352
mode 100644,000000..100644
--- a/sys/dev/ena/ena.c
+++ b/sys/dev/ena/ena.c
@@@ -1,3880 -1,0 +1,3920 @@@
 +/*-
 + * SPDX-License-Identifier: BSD-2-Clause
 + *
 + * Copyright (c) 2015-2021 Amazon.com, Inc. or its affiliates.
 + * All rights reserved.
 + *
 + * Redistribution and use in source and binary forms, with or without
 + * modification, are permitted provided that the following conditions
 + * are met:
 + *
 + * 1. Redistributions of source code must retain the above copyright
 + *    notice, this list of conditions and the following disclaimer.
 + *
 + * 2. Redistributions in binary form must reproduce the above copyright
 + *    notice, this list of conditions and the following disclaimer in the
 + *    documentation and/or other materials provided with the distribution.
 + *
 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 + */
 +#include <sys/cdefs.h>
 +__FBSDID("$FreeBSD$");
 +
 +#include "opt_rss.h"
 +
 +#include <sys/param.h>
 +#include <sys/systm.h>
 +#include <sys/bus.h>
 +#include <sys/endian.h>
 +#include <sys/kernel.h>
 +#include <sys/kthread.h>
 +#include <sys/malloc.h>
 +#include <sys/mbuf.h>
 +#include <sys/module.h>
 +#include <sys/rman.h>
 +#include <sys/smp.h>
 +#include <sys/socket.h>
 +#include <sys/sockio.h>
 +#include <sys/sysctl.h>
 +#include <sys/taskqueue.h>
 +#include <sys/time.h>
 +#include <sys/eventhandler.h>
 +
 +#include <machine/bus.h>
 +#include <machine/resource.h>
 +#include <machine/in_cksum.h>
 +
 +#include <net/bpf.h>
 +#include <net/ethernet.h>
 +#include <net/if.h>
 +#include <net/if_var.h>
 +#include <net/if_arp.h>
 +#include <net/if_dl.h>
 +#include <net/if_media.h>
 +#include <net/if_types.h>
 +#include <net/if_vlan_var.h>
 +
 +#include <netinet/in_systm.h>
 +#include <netinet/in.h>
 +#include <netinet/if_ether.h>
 +#include <netinet/ip.h>
 +#include <netinet/ip6.h>
 +#include <netinet/tcp.h>
 +#include <netinet/udp.h>
 +
 +#include <dev/pci/pcivar.h>
 +#include <dev/pci/pcireg.h>
 +
 +#include <vm/vm.h>
 +#include <vm/pmap.h>
 +
 +#include "ena_datapath.h"
 +#include "ena.h"
 +#include "ena_sysctl.h"
 +#include "ena_rss.h"
 +
 +#ifdef DEV_NETMAP
 +#include "ena_netmap.h"
 +#endif /* DEV_NETMAP */
 +
 +/*********************************************************
 + *  Function prototypes
 + *********************************************************/
 +static int	ena_probe(device_t);
 +static void	ena_intr_msix_mgmnt(void *);
 +static void	ena_free_pci_resources(struct ena_adapter *);
 +static int	ena_change_mtu(if_t, int);
 +static inline void ena_alloc_counters(counter_u64_t *, int);
 +static inline void ena_free_counters(counter_u64_t *, int);
 +static inline void ena_reset_counters(counter_u64_t *, int);
 +static void	ena_init_io_rings_common(struct ena_adapter *,
 +    struct ena_ring *, uint16_t);
 +static void	ena_init_io_rings_basic(struct ena_adapter *);
 +static void	ena_init_io_rings_advanced(struct ena_adapter *);
 +static void	ena_init_io_rings(struct ena_adapter *);
 +static void	ena_free_io_ring_resources(struct ena_adapter *, unsigned int);
 +static void	ena_free_all_io_rings_resources(struct ena_adapter *);
 +static int	ena_setup_tx_dma_tag(struct ena_adapter *);
 +static int	ena_free_tx_dma_tag(struct ena_adapter *);
 +static int	ena_setup_rx_dma_tag(struct ena_adapter *);
 +static int	ena_free_rx_dma_tag(struct ena_adapter *);
 +static void	ena_release_all_tx_dmamap(struct ena_ring *);
 +static int	ena_setup_tx_resources(struct ena_adapter *, int);
 +static void	ena_free_tx_resources(struct ena_adapter *, int);
 +static int	ena_setup_all_tx_resources(struct ena_adapter *);
 +static void	ena_free_all_tx_resources(struct ena_adapter *);
 +static int	ena_setup_rx_resources(struct ena_adapter *, unsigned int);
 +static void	ena_free_rx_resources(struct ena_adapter *, unsigned int);
 +static int	ena_setup_all_rx_resources(struct ena_adapter *);
 +static void	ena_free_all_rx_resources(struct ena_adapter *);
 +static inline int ena_alloc_rx_mbuf(struct ena_adapter *, struct ena_ring *,
 +    struct ena_rx_buffer *);
 +static void	ena_free_rx_mbuf(struct ena_adapter *, struct ena_ring *,
 +    struct ena_rx_buffer *);
 +static void	ena_free_rx_bufs(struct ena_adapter *, unsigned int);
 +static void	ena_refill_all_rx_bufs(struct ena_adapter *);
 +static void	ena_free_all_rx_bufs(struct ena_adapter *);
 +static void	ena_free_tx_bufs(struct ena_adapter *, unsigned int);
 +static void	ena_free_all_tx_bufs(struct ena_adapter *);
 +static void	ena_destroy_all_tx_queues(struct ena_adapter *);
 +static void	ena_destroy_all_rx_queues(struct ena_adapter *);
 +static void	ena_destroy_all_io_queues(struct ena_adapter *);
 +static int	ena_create_io_queues(struct ena_adapter *);
 +static int	ena_handle_msix(void *);
 +static int	ena_enable_msix(struct ena_adapter *);
 +static void	ena_setup_mgmnt_intr(struct ena_adapter *);
 +static int	ena_setup_io_intr(struct ena_adapter *);
 +static int	ena_request_mgmnt_irq(struct ena_adapter *);
 +static int	ena_request_io_irq(struct ena_adapter *);
 +static void	ena_free_mgmnt_irq(struct ena_adapter *);
 +static void	ena_free_io_irq(struct ena_adapter *);
 +static void	ena_free_irqs(struct ena_adapter*);
 +static void	ena_disable_msix(struct ena_adapter *);
 +static void	ena_unmask_all_io_irqs(struct ena_adapter *);
 +static int	ena_up_complete(struct ena_adapter *);
 +static uint64_t	ena_get_counter(if_t, ift_counter);
 +static int	ena_media_change(if_t);
 +static void	ena_media_status(if_t, struct ifmediareq *);
 +static void	ena_init(void *);
 +static int	ena_ioctl(if_t, u_long, caddr_t);
 +static int	ena_get_dev_offloads(struct ena_com_dev_get_features_ctx *);
 +static void	ena_update_host_info(struct ena_admin_host_info *, if_t);
 +static void	ena_update_hwassist(struct ena_adapter *);
 +static int	ena_setup_ifnet(device_t, struct ena_adapter *,
 +    struct ena_com_dev_get_features_ctx *);
 +static int	ena_enable_wc(device_t, struct resource *);
 +static int	ena_set_queues_placement_policy(device_t, struct ena_com_dev *,
 +    struct ena_admin_feature_llq_desc *, struct ena_llq_configurations *);
 +static uint32_t	ena_calc_max_io_queue_num(device_t, struct ena_com_dev *,
 +    struct ena_com_dev_get_features_ctx *);
 +static int	ena_calc_io_queue_size(struct ena_calc_queue_size_ctx *);
 +static void	ena_config_host_info(struct ena_com_dev *, device_t);
 +static int	ena_attach(device_t);
 +static int	ena_detach(device_t);
 +static int	ena_device_init(struct ena_adapter *, device_t,
 +    struct ena_com_dev_get_features_ctx *, int *);
 +static int	ena_enable_msix_and_set_admin_interrupts(struct ena_adapter *);
 +static void ena_update_on_link_change(void *, struct ena_admin_aenq_entry *);
 +static void	unimplemented_aenq_handler(void *,
 +    struct ena_admin_aenq_entry *);
 +static int	ena_copy_eni_metrics(struct ena_adapter *);
 +static void	ena_timer_service(void *);
 +
 +static char ena_version[] = DEVICE_NAME DRV_MODULE_NAME " v" DRV_MODULE_VERSION;
 +
 +static ena_vendor_info_t ena_vendor_info_array[] = {
 +    { PCI_VENDOR_ID_AMAZON, PCI_DEV_ID_ENA_PF, 0},
 +    { PCI_VENDOR_ID_AMAZON, PCI_DEV_ID_ENA_PF_RSERV0, 0},
 +    { PCI_VENDOR_ID_AMAZON, PCI_DEV_ID_ENA_VF, 0},
 +    { PCI_VENDOR_ID_AMAZON, PCI_DEV_ID_ENA_VF_RSERV0, 0},
 +    /* Last entry */
 +    { 0, 0, 0 }
 +};
 +
 +struct sx ena_global_lock;
 +
 +/*
 + * Contains pointers to event handlers, e.g. link state change.
 + */
 +static struct ena_aenq_handlers aenq_handlers;
 +
 +void
 +ena_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nseg, int error)
 +{
 +	if (error != 0)
 +		return;
 +	*(bus_addr_t *) arg = segs[0].ds_addr;
 +}
 +
 +/*
 + * Create a DMA tag, allocate zeroed coherent memory from it and load the
 + * mapping, all for a single-segment buffer.
 + *
 + * dmadev    - device whose softc is the ena_adapter; its DMA tag is the
 + *             parent of the tag created here
 + * size      - requested size in bytes; the tag is sized to 'size' rounded
 + *             up to a multiple of PAGE_SIZE
 + * dma       - handle that receives the tag, map, virtual and bus address
 + * mapflags  - flags passed to bus_dmamap_load()
 + * alignment - alignment passed to bus_dma_tag_create()
 + * domain    - NUMA domain passed to bus_dma_tag_set_domain()
 + *
 + * Returns 0 on success. On failure every resource acquired so far is
 + * released, dma->tag, dma->vaddr and dma->paddr are reset, and a non-zero
 + * error code is returned.
 + */
 +int
 +ena_dma_alloc(device_t dmadev, bus_size_t size,
-     ena_mem_handle_t *dma, int mapflags, bus_size_t alignment)
++    ena_mem_handle_t *dma, int mapflags, bus_size_t alignment, int domain)
 +{
 +	struct ena_adapter* adapter = device_get_softc(dmadev);
 +	device_t pdev = adapter->pdev;
 +	uint32_t maxsize;
 +	uint64_t dma_space_addr;
 +	int error;
 +
 +	/* Round the request up to whole pages. */
 +	maxsize = ((size - 1) / PAGE_SIZE + 1) * PAGE_SIZE;
 +
 +	/* Fall back to the full bus address space if the mask is empty. */
 +	dma_space_addr = ENA_DMA_BIT_MASK(adapter->dma_width);
 +	if (unlikely(dma_space_addr == 0))
 +		dma_space_addr = BUS_SPACE_MAXADDR;
 +
 +	error = bus_dma_tag_create(bus_get_dma_tag(dmadev), /* parent */
 +	    alignment, 0,     /* alignment, bounds */
 +	    dma_space_addr,   /* lowaddr of exclusion window */
 +	    BUS_SPACE_MAXADDR,/* highaddr of exclusion window */
 +	    NULL, NULL,	      /* filter, filterarg */
 +	    maxsize,	      /* maxsize */
 +	    1,		      /* nsegments */
 +	    maxsize,	      /* maxsegsize */
 +	    BUS_DMA_ALLOCNOW, /* flags */
 +	    NULL,	      /* lockfunc */
 +	    NULL,	      /* lockarg */
 +	    &dma->tag);
 +	if (unlikely(error != 0)) {
 +		ena_log(pdev, ERR, "bus_dma_tag_create failed: %d\n", error);
 +		goto fail_tag;
 +	}
 +
++	error = bus_dma_tag_set_domain(dma->tag, domain);
++	if (unlikely(error != 0)) {
++		ena_log(pdev, ERR, "bus_dma_tag_set_domain failed: %d\n",
++		    error);
++		goto fail_map_create;
++	}
++
 +	error = bus_dmamem_alloc(dma->tag, (void**) &dma->vaddr,
 +	    BUS_DMA_COHERENT | BUS_DMA_ZERO, &dma->map);
 +	if (unlikely(error != 0)) {
 +		ena_log(pdev, ERR, "bus_dmamem_alloc(%ju) failed: %d\n",
 +		    (uintmax_t)size, error);
 +		goto fail_map_create;
 +	}
 +
 +	/* paddr stays 0 unless ena_dmamap_callback() stores an address. */
 +	dma->paddr = 0;
 +	error = bus_dmamap_load(dma->tag, dma->map, dma->vaddr,
 +	    size, ena_dmamap_callback, &dma->paddr, mapflags);
 +	if (unlikely((error != 0) || (dma->paddr == 0))) {
 +		ena_log(pdev, ERR, "bus_dmamap_load failed: %d\n", error);
 +		/*
 +		 * The load can report 0 while no address was stored. Never
 +		 * return success after the buffer has been torn down.
 +		 */
 +		if (error == 0)
 +			error = ENOMEM;
 +		goto fail_map_load;
 +	}
 +
 +	bus_dmamap_sync(dma->tag, dma->map,
 +	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 +
 +	return (0);
 +
 +	/* Unwind in reverse order of acquisition. */
 +fail_map_load:
 +	bus_dmamem_free(dma->tag, dma->vaddr, dma->map);
 +fail_map_create:
 +	bus_dma_tag_destroy(dma->tag);
 +fail_tag:
 +	dma->tag = NULL;
 +	dma->vaddr = NULL;
 +	dma->paddr = 0;
 +
 +	return (error);
 +}
 +
 +/*
 + * Release the bus resources recorded in the adapter: the memory BAR, the
 + * register BAR and the MSI-X resource. Each one is released only if its
 + * pointer is non-NULL.
 + */
 +static void
 +ena_free_pci_resources(struct ena_adapter *adapter)
 +{
 +	device_t dev = adapter->pdev;
 +
 +	if (adapter->memory != NULL)
 +		bus_release_resource(dev, SYS_RES_MEMORY,
 +		    PCIR_BAR(ENA_MEM_BAR), adapter->memory);
 +
 +	if (adapter->registers != NULL)
 +		bus_release_resource(dev, SYS_RES_MEMORY,
 +		    PCIR_BAR(ENA_REG_BAR), adapter->registers);
 +
 +	if (adapter->msix != NULL)
 +		bus_release_resource(dev, SYS_RES_MEMORY,
 +		    adapter->msix_rid, adapter->msix);
 +}
 +
 +/*
 + * Probe routine: claim the device when its PCI vendor/device ID pair appears
 + * in ena_vendor_info_array.
 + *
 + * Returns BUS_PROBE_DEFAULT on a match, after setting the device description
 + * to DEVICE_DESC, and ENXIO when no table entry matches.
 + */
 +static int
 +ena_probe(device_t dev)
 +{
 +	ena_vendor_info_t *ent;
 +	char		adapter_name[60];
 +	uint16_t	pci_vendor_id;
 +	uint16_t	pci_device_id;
 +
 +	pci_vendor_id = pci_get_vendor(dev);
 +	pci_device_id = pci_get_device(dev);
 +
 +	/* The table ends with an entry whose vendor_id is 0. */
 +	for (ent = ena_vendor_info_array; ent->vendor_id != 0; ent++) {
 +		if ((pci_vendor_id != ent->vendor_id) ||
 +		    (pci_device_id != ent->device_id))
 +			continue;
 +
 +		ena_log_raw(DBG, "vendor=%x device=%x\n",
 +		    pci_vendor_id, pci_device_id);
 +
 +		/*
 +		 * Bounded copy with an explicit "%s": DEVICE_DESC is never
 +		 * interpreted as a format string and cannot overrun the
 +		 * buffer.
 +		 */
 +		snprintf(adapter_name, sizeof(adapter_name), "%s",
 +		    DEVICE_DESC);
 +		device_set_desc_copy(dev, adapter_name);
 +		return (BUS_PROBE_DEFAULT);
 +	}
 +
 +	return (ENXIO);
 +}
 +
 +/*
 + * Change the interface MTU.
 + *
 + * Rejects values outside [ENA_MIN_MTU, adapter->max_mtu] with EINVAL.
 + * Otherwise asks the device to switch via ena_com_set_dev_mtu() and, only
 + * if that succeeds, records the new MTU on the ifnet. Returns the result of
 + * ena_com_set_dev_mtu().
 + */
 +static int
 +ena_change_mtu(if_t ifp, int new_mtu)
 +{
 +	struct ena_adapter *adapter = if_getsoftc(ifp);
 +	device_t pdev = adapter->pdev;
 +	int rc;
 +
 +	if ((new_mtu < ENA_MIN_MTU) || (new_mtu > adapter->max_mtu)) {
 +		ena_log(pdev, ERR, "Invalid MTU setting. "
 +		    "new_mtu: %d max mtu: %d min mtu: %d\n",
 +		    new_mtu, adapter->max_mtu, ENA_MIN_MTU);
 +		return (EINVAL);
 +	}
 +
 +	rc = ena_com_set_dev_mtu(adapter->ena_dev, new_mtu);
 +	if (unlikely(rc != 0)) {
 +		ena_log(pdev, ERR, "Failed to set MTU to %d\n", new_mtu);
 +		return (rc);
 +	}
 +
 +	ena_log(pdev, DBG, "set MTU to %d\n", new_mtu);
 +	if_setmtu(ifp, new_mtu);
 +
 +	return (rc);
 +}
 +
 +/*
 + * Allocate a counter (M_WAITOK) for every counter_u64_t slot in the region
 + * that starts at 'begin' and spans 'size' bytes.
 + */
 +static inline void
 +ena_alloc_counters(counter_u64_t *begin, int size)
 +{
 +	counter_u64_t *limit = (counter_u64_t *)((char *)begin + size);
 +	counter_u64_t *cur = begin;
 +
 +	while (cur < limit) {
 +		*cur = counter_u64_alloc(M_WAITOK);
 +		cur++;
 +	}
 +}
 +
 +/*
 + * Free every counter stored in the region that starts at 'begin' and spans
 + * 'size' bytes.
 + */
 +static inline void
 +ena_free_counters(counter_u64_t *begin, int size)
 +{
 +	counter_u64_t *limit = (counter_u64_t *)((char *)begin + size);
 +	counter_u64_t *cur = begin;
 +
 +	while (cur < limit) {
 +		counter_u64_free(*cur);
 +		cur++;
 +	}
 +}
 +
 +/*
 + * Zero every counter stored in the region that starts at 'begin' and spans
 + * 'size' bytes.
 + */
 +static inline void
 +ena_reset_counters(counter_u64_t *begin, int size)
 +{
 +	counter_u64_t *limit = (counter_u64_t *)((char *)begin + size);
 +	counter_u64_t *cur = begin;
 +
 +	while (cur < limit) {
 +		counter_u64_zero(*cur);
 +		cur++;
 +	}
 +}
 +
 +static void
 +ena_init_io_rings_common(struct ena_adapter *adapter, struct ena_ring *ring,
 +    uint16_t qid)
 +{
*** 5072 LINES SKIPPED ***