/*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2015-2024 Amazon.com, Inc. or its affiliates. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include "opt_rss.h" #include "ena_rss.h" #include "ena_sysctl.h" static void ena_sysctl_add_wd(struct ena_adapter *); static void ena_sysctl_add_stats(struct ena_adapter *); static void ena_sysctl_add_eni_metrics(struct ena_adapter *); static void ena_sysctl_add_customer_metrics(struct ena_adapter *); static void ena_sysctl_add_srd_info(struct ena_adapter *); static void ena_sysctl_add_tuneables(struct ena_adapter *); static void ena_sysctl_add_irq_affinity(struct ena_adapter *); /* Kernel option RSS prevents manipulation of key hash and indirection table. */ #ifndef RSS static void ena_sysctl_add_rss(struct ena_adapter *); #endif static int ena_sysctl_buf_ring_size(SYSCTL_HANDLER_ARGS); static int ena_sysctl_rx_queue_size(SYSCTL_HANDLER_ARGS); static int ena_sysctl_io_queues_nb(SYSCTL_HANDLER_ARGS); static int ena_sysctl_irq_base_cpu(SYSCTL_HANDLER_ARGS); static int ena_sysctl_irq_cpu_stride(SYSCTL_HANDLER_ARGS); static int ena_sysctl_metrics_interval(SYSCTL_HANDLER_ARGS); #ifndef RSS static int ena_sysctl_rss_key(SYSCTL_HANDLER_ARGS); static int ena_sysctl_rss_indir_table(SYSCTL_HANDLER_ARGS); #endif /* Limit max ENA sample rate to be an hour. */ #define ENA_METRICS_MAX_SAMPLE_INTERVAL 3600 #define ENA_HASH_KEY_MSG_SIZE (ENA_HASH_KEY_SIZE * 2 + 1) #define SYSCTL_GSTRING_LEN 128 #define ENA_METRIC_ENI_ENTRY(stat, desc) { \ .name = #stat, \ .description = #desc, \ } #define ENA_STAT_ENTRY(stat, desc, stat_type) { \ .name = #stat, \ .description = #desc, \ .stat_offset = offsetof(struct ena_admin_##stat_type, stat) / sizeof(u64), \ } #define ENA_STAT_ENA_SRD_ENTRY(stat, desc) \ ENA_STAT_ENTRY(stat, desc, ena_srd_stats) struct ena_hw_metrics { char name[SYSCTL_GSTRING_LEN]; char description[SYSCTL_GSTRING_LEN]; }; struct ena_srd_metrics { char name[SYSCTL_GSTRING_LEN]; char description[SYSCTL_GSTRING_LEN]; int stat_offset; }; static const struct ena_srd_metrics ena_srd_stats_strings[] = { ENA_STAT_ENA_SRD_ENTRY( ena_srd_tx_pkts, Number of packets transmitted over ENA SRD), ENA_STAT_ENA_SRD_ENTRY( ena_srd_eligible_tx_pkts, Number of packets transmitted or could have been transmitted over ENA SRD), ENA_STAT_ENA_SRD_ENTRY( ena_srd_rx_pkts, Number of packets received over ENA SRD), ENA_STAT_ENA_SRD_ENTRY( ena_srd_resource_utilization, Percentage of the ENA SRD resources that are in use), }; static const struct ena_hw_metrics ena_hw_stats_strings[] = { ENA_METRIC_ENI_ENTRY( bw_in_allowance_exceeded, Inbound BW allowance exceeded), ENA_METRIC_ENI_ENTRY( bw_out_allowance_exceeded, Outbound BW allowance exceeded), ENA_METRIC_ENI_ENTRY( pps_allowance_exceeded, PPS allowance exceeded), ENA_METRIC_ENI_ENTRY( conntrack_allowance_exceeded, Connection tracking allowance exceeded), ENA_METRIC_ENI_ENTRY( linklocal_allowance_exceeded, Linklocal packet rate allowance), ENA_METRIC_ENI_ENTRY( conntrack_allowance_available, Number of available conntracks), }; #ifndef ARRAY_SIZE #define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0])) #endif #define ENA_CUSTOMER_METRICS_ARRAY_SIZE ARRAY_SIZE(ena_hw_stats_strings) #define ENA_SRD_METRICS_ARRAY_SIZE ARRAY_SIZE(ena_srd_stats_strings) static SYSCTL_NODE(_hw, OID_AUTO, ena, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "ENA driver parameters"); /* * Logging level for changing verbosity of the output */ int ena_log_level = ENA_INFO; SYSCTL_INT(_hw_ena, OID_AUTO, log_level, CTLFLAG_RWTUN, &ena_log_level, 0, "Logging level indicating verbosity of the logs"); SYSCTL_CONST_STRING(_hw_ena, OID_AUTO, driver_version, CTLFLAG_RD, ENA_DRV_MODULE_VERSION, "ENA driver version"); /* * Use 9k mbufs for the Rx buffers. Default to 0 (use page size mbufs instead). * Using 9k mbufs in low memory conditions might cause allocation to take a lot * of time and lead to the OS instability as it needs to look for the contiguous * pages. * However, page size mbufs has a bit smaller throughput than 9k mbufs, so if * the network performance is the priority, the 9k mbufs can be used. */ int ena_enable_9k_mbufs = 0; SYSCTL_INT(_hw_ena, OID_AUTO, enable_9k_mbufs, CTLFLAG_RDTUN, &ena_enable_9k_mbufs, 0, "Use 9 kB mbufs for Rx descriptors"); /* * Force the driver to use large or regular LLQ (Low Latency Queue) header size. * Defaults to ENA_LLQ_HEADER_SIZE_POLICY_DEFAULT. This option may be * important for platforms, which often handle packet headers on Tx with total * header size greater than 96B, as it may reduce the latency. * It also reduces the maximum Tx queue size by half, so it may cause more Tx * packet drops. */ int ena_force_large_llq_header = ENA_LLQ_HEADER_SIZE_POLICY_DEFAULT; SYSCTL_INT(_hw_ena, OID_AUTO, force_large_llq_header, CTLFLAG_RDTUN, &ena_force_large_llq_header, 0, "Change default LLQ entry size received from the device"); int ena_rss_table_size = ENA_RX_RSS_TABLE_SIZE; int ena_sysctl_allocate_customer_metrics_buffer(struct ena_adapter *adapter) { int rc = 0; adapter->customer_metrics_array = malloc((sizeof(u64) * ENA_CUSTOMER_METRICS_ARRAY_SIZE), M_DEVBUF, M_NOWAIT | M_ZERO); if (unlikely(adapter->customer_metrics_array == NULL)) rc = ENOMEM; return rc; } void ena_sysctl_add_nodes(struct ena_adapter *adapter) { struct ena_com_dev *dev = adapter->ena_dev; if (ena_com_get_cap(dev, ENA_ADMIN_CUSTOMER_METRICS)) ena_sysctl_add_customer_metrics(adapter); else if (ena_com_get_cap(dev, ENA_ADMIN_ENI_STATS)) ena_sysctl_add_eni_metrics(adapter); if (ena_com_get_cap(adapter->ena_dev, ENA_ADMIN_ENA_SRD_INFO)) ena_sysctl_add_srd_info(adapter); ena_sysctl_add_wd(adapter); ena_sysctl_add_stats(adapter); ena_sysctl_add_tuneables(adapter); ena_sysctl_add_irq_affinity(adapter); #ifndef RSS ena_sysctl_add_rss(adapter); #endif } static void ena_sysctl_add_wd(struct ena_adapter *adapter) { device_t dev; struct sysctl_ctx_list *ctx; struct sysctl_oid *tree; struct sysctl_oid_list *child; dev = adapter->pdev; ctx = device_get_sysctl_ctx(dev); tree = device_get_sysctl_tree(dev); child = SYSCTL_CHILDREN(tree); /* Sysctl calls for Watchdog service */ SYSCTL_ADD_INT(ctx, child, OID_AUTO, "wd_active", CTLFLAG_RWTUN, &adapter->wd_active, 0, "Watchdog is active"); SYSCTL_ADD_QUAD(ctx, child, OID_AUTO, "keep_alive_timeout", CTLFLAG_RWTUN, &adapter->keep_alive_timeout, "Timeout for Keep Alive messages"); SYSCTL_ADD_QUAD(ctx, child, OID_AUTO, "missing_tx_timeout", CTLFLAG_RWTUN, &adapter->missing_tx_timeout, "Timeout for TX completion"); SYSCTL_ADD_U32(ctx, child, OID_AUTO, "missing_tx_max_queues", CTLFLAG_RWTUN, &adapter->missing_tx_max_queues, 0, "Number of TX queues to check per run"); SYSCTL_ADD_U32(ctx, child, OID_AUTO, "missing_tx_threshold", CTLFLAG_RWTUN, &adapter->missing_tx_threshold, 0, "Max number of timeouted packets"); } static void ena_sysctl_add_stats(struct ena_adapter *adapter) { device_t dev; struct ena_ring *tx_ring; struct ena_ring *rx_ring; struct ena_hw_stats *hw_stats; struct ena_stats_dev *dev_stats; struct ena_stats_tx *tx_stats; struct ena_stats_rx *rx_stats; struct ena_com_stats_admin *admin_stats; struct sysctl_ctx_list *ctx; struct sysctl_oid *tree; struct sysctl_oid_list *child; struct sysctl_oid *queue_node, *tx_node, *rx_node, *hw_node; struct sysctl_oid *admin_node; struct sysctl_oid_list *queue_list, *tx_list, *rx_list, *hw_list; struct sysctl_oid_list *admin_list; #define QUEUE_NAME_LEN 32 char namebuf[QUEUE_NAME_LEN]; int i; dev = adapter->pdev; ctx = device_get_sysctl_ctx(dev); tree = device_get_sysctl_tree(dev); child = SYSCTL_CHILDREN(tree); tx_ring = adapter->tx_ring; rx_ring = adapter->rx_ring; hw_stats = &adapter->hw_stats; dev_stats = &adapter->dev_stats; admin_stats = &adapter->ena_dev->admin_queue.stats; SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "wd_expired", CTLFLAG_RD, &dev_stats->wd_expired, "Watchdog expiry count"); SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "interface_up", CTLFLAG_RD, &dev_stats->interface_up, "Network interface up count"); SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "interface_down", CTLFLAG_RD, &dev_stats->interface_down, "Network interface down count"); SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "admin_q_pause", CTLFLAG_RD, &dev_stats->admin_q_pause, "Admin queue pauses"); SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "os_trigger", CTLFLAG_RD, &dev_stats->os_trigger, "OS trigger count"); SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "missing_tx_cmpl", CTLFLAG_RD, &dev_stats->missing_tx_cmpl, "Missing TX completions resets count"); SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "bad_rx_req_id", CTLFLAG_RD, &dev_stats->bad_rx_req_id, "Bad RX req id count"); SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "bad_tx_req_id", CTLFLAG_RD, &dev_stats->bad_tx_req_id, "Bad TX req id count"); SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "bad_rx_desc_num", CTLFLAG_RD, &dev_stats->bad_rx_desc_num, "Bad RX descriptors number count"); SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "invalid_state", CTLFLAG_RD, &dev_stats->invalid_state, "Driver invalid state count"); SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "missing_intr", CTLFLAG_RD, &dev_stats->missing_intr, "Missing interrupt count"); SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "tx_desc_malformed", CTLFLAG_RD, &dev_stats->tx_desc_malformed, "TX descriptors malformed count"); SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "rx_desc_malformed", CTLFLAG_RD, &dev_stats->rx_desc_malformed, "RX descriptors malformed count"); SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "missing_admin_interrupt", CTLFLAG_RD, &dev_stats->missing_admin_interrupt, "Missing admin interrupts count"); SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "admin_to", CTLFLAG_RD, &dev_stats->admin_to, "Admin queue timeouts count"); SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "device_request_reset", CTLFLAG_RD, &dev_stats->device_request_reset, "Device reset requests count"); SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "total_resets", CTLFLAG_RD, &dev_stats->total_resets, "Total resets count"); for (i = 0; i < adapter->num_io_queues; ++i, ++tx_ring, ++rx_ring) { snprintf(namebuf, QUEUE_NAME_LEN, "queue%d", i); queue_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Queue Name"); queue_list = SYSCTL_CHILDREN(queue_node); adapter->que[i].oid = queue_node; #ifdef RSS /* Common stats */ SYSCTL_ADD_INT(ctx, queue_list, OID_AUTO, "cpu", CTLFLAG_RD, &adapter->que[i].cpu, 0, "CPU affinity"); SYSCTL_ADD_INT(ctx, queue_list, OID_AUTO, "domain", CTLFLAG_RD, &adapter->que[i].domain, 0, "NUMA domain"); #endif /* TX specific stats */ tx_node = SYSCTL_ADD_NODE(ctx, queue_list, OID_AUTO, "tx_ring", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "TX ring"); tx_list = SYSCTL_CHILDREN(tx_node); tx_stats = &tx_ring->tx_stats; SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "count", CTLFLAG_RD, &tx_stats->cnt, "Packets sent"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "bytes", CTLFLAG_RD, &tx_stats->bytes, "Bytes sent"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "prepare_ctx_err", CTLFLAG_RD, &tx_stats->prepare_ctx_err, "TX buffer preparation failures"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "dma_mapping_err", CTLFLAG_RD, &tx_stats->dma_mapping_err, "DMA mapping failures"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "doorbells", CTLFLAG_RD, &tx_stats->doorbells, "Queue doorbells"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "missing_tx_comp", CTLFLAG_RD, &tx_stats->missing_tx_comp, "TX completions missed"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "bad_req_id", CTLFLAG_RD, &tx_stats->bad_req_id, "Bad request id count"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "mbuf_collapses", CTLFLAG_RD, &tx_stats->collapse, "Mbuf collapse count"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "mbuf_collapse_err", CTLFLAG_RD, &tx_stats->collapse_err, "Mbuf collapse failures"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "queue_wakeups", CTLFLAG_RD, &tx_stats->queue_wakeup, "Queue wakeups"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "queue_stops", CTLFLAG_RD, &tx_stats->queue_stop, "Queue stops"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "llq_buffer_copy", CTLFLAG_RD, &tx_stats->llq_buffer_copy, "Header copies for llq transaction"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "unmask_interrupt_num", CTLFLAG_RD, &tx_stats->unmask_interrupt_num, "Unmasked interrupt count"); /* RX specific stats */ rx_node = SYSCTL_ADD_NODE(ctx, queue_list, OID_AUTO, "rx_ring", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "RX ring"); rx_list = SYSCTL_CHILDREN(rx_node); rx_stats = &rx_ring->rx_stats; SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, "count", CTLFLAG_RD, &rx_stats->cnt, "Packets received"); SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, "bytes", CTLFLAG_RD, &rx_stats->bytes, "Bytes received"); SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, "refil_partial", CTLFLAG_RD, &rx_stats->refil_partial, "Partial refilled mbufs"); SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, "csum_bad", CTLFLAG_RD, &rx_stats->csum_bad, "Bad RX checksum"); SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, "mbuf_alloc_fail", CTLFLAG_RD, &rx_stats->mbuf_alloc_fail, "Failed mbuf allocs"); SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, "mjum_alloc_fail", CTLFLAG_RD, &rx_stats->mjum_alloc_fail, "Failed jumbo mbuf allocs"); SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, "dma_mapping_err", CTLFLAG_RD, &rx_stats->dma_mapping_err, "DMA mapping errors"); SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, "bad_desc_num", CTLFLAG_RD, &rx_stats->bad_desc_num, "Bad descriptor count"); SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, "bad_req_id", CTLFLAG_RD, &rx_stats->bad_req_id, "Bad request id count"); SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, "empty_rx_ring", CTLFLAG_RD, &rx_stats->empty_rx_ring, "RX descriptors depletion count"); SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, "csum_good", CTLFLAG_RD, &rx_stats->csum_good, "Valid RX checksum calculations"); } /* Stats read from device */ hw_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "hw_stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Statistics from hardware"); hw_list = SYSCTL_CHILDREN(hw_node); SYSCTL_ADD_COUNTER_U64(ctx, hw_list, OID_AUTO, "rx_packets", CTLFLAG_RD, &hw_stats->rx_packets, "Packets received"); SYSCTL_ADD_COUNTER_U64(ctx, hw_list, OID_AUTO, "tx_packets", CTLFLAG_RD, &hw_stats->tx_packets, "Packets transmitted"); SYSCTL_ADD_COUNTER_U64(ctx, hw_list, OID_AUTO, "rx_bytes", CTLFLAG_RD, &hw_stats->rx_bytes, "Bytes received"); SYSCTL_ADD_COUNTER_U64(ctx, hw_list, OID_AUTO, "tx_bytes", CTLFLAG_RD, &hw_stats->tx_bytes, "Bytes transmitted"); SYSCTL_ADD_COUNTER_U64(ctx, hw_list, OID_AUTO, "rx_drops", CTLFLAG_RD, &hw_stats->rx_drops, "Receive packet drops"); SYSCTL_ADD_COUNTER_U64(ctx, hw_list, OID_AUTO, "tx_drops", CTLFLAG_RD, &hw_stats->tx_drops, "Transmit packet drops"); /* ENA Admin queue stats */ admin_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "admin_stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "ENA Admin Queue statistics"); admin_list = SYSCTL_CHILDREN(admin_node); SYSCTL_ADD_U64(ctx, admin_list, OID_AUTO, "aborted_cmd", CTLFLAG_RD, &admin_stats->aborted_cmd, 0, "Aborted commands"); SYSCTL_ADD_U64(ctx, admin_list, OID_AUTO, "sumbitted_cmd", CTLFLAG_RD, &admin_stats->submitted_cmd, 0, "Submitted commands"); SYSCTL_ADD_U64(ctx, admin_list, OID_AUTO, "completed_cmd", CTLFLAG_RD, &admin_stats->completed_cmd, 0, "Completed commands"); SYSCTL_ADD_U64(ctx, admin_list, OID_AUTO, "out_of_space", CTLFLAG_RD, &admin_stats->out_of_space, 0, "Queue out of space"); SYSCTL_ADD_U64(ctx, admin_list, OID_AUTO, "no_completion", CTLFLAG_RD, &admin_stats->no_completion, 0, "Commands not completed"); } static void ena_sysctl_add_srd_info(struct ena_adapter *adapter) { device_t dev; struct sysctl_oid *ena_srd_info; struct sysctl_oid_list *srd_list; struct sysctl_ctx_list *ctx; struct sysctl_oid *tree; struct sysctl_oid_list *child; struct ena_admin_ena_srd_stats *srd_stats_ptr; struct ena_srd_metrics cur_stat_strings; int i; dev = adapter->pdev; ctx = device_get_sysctl_ctx(dev); tree = device_get_sysctl_tree(dev); child = SYSCTL_CHILDREN(tree); ena_srd_info = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "ena_srd_info", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "ENA's SRD information"); srd_list = SYSCTL_CHILDREN(ena_srd_info); SYSCTL_ADD_U64(ctx, srd_list, OID_AUTO, "ena_srd_mode", CTLFLAG_RD, &adapter->ena_srd_info.flags, 0, "Describes which ENA-express features are enabled"); srd_stats_ptr = &adapter->ena_srd_info.ena_srd_stats; for (i = 0 ; i < ENA_SRD_METRICS_ARRAY_SIZE; i++) { cur_stat_strings = ena_srd_stats_strings[i]; SYSCTL_ADD_U64(ctx, srd_list, OID_AUTO, cur_stat_strings.name, CTLFLAG_RD, (u64 *)srd_stats_ptr + cur_stat_strings.stat_offset, 0, cur_stat_strings.description); } } static void ena_sysctl_add_customer_metrics(struct ena_adapter *adapter) { device_t dev; struct ena_com_dev *ena_dev; struct sysctl_ctx_list *ctx; struct sysctl_oid *tree; struct sysctl_oid_list *child; struct sysctl_oid *customer_metric; struct sysctl_oid_list *customer_list; int i; dev = adapter->pdev; ena_dev = adapter->ena_dev; ctx = device_get_sysctl_ctx(dev); tree = device_get_sysctl_tree(dev); child = SYSCTL_CHILDREN(tree); customer_metric = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "customer_metrics", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "ENA's customer metrics"); customer_list = SYSCTL_CHILDREN(customer_metric); for (i = 0; i < ENA_CUSTOMER_METRICS_ARRAY_SIZE; i++) { if (ena_com_get_customer_metric_support(ena_dev, i)) { SYSCTL_ADD_U64(ctx, customer_list, OID_AUTO, ena_hw_stats_strings[i].name, CTLFLAG_RD, &adapter->customer_metrics_array[i], 0, ena_hw_stats_strings[i].description); } } } static void ena_sysctl_add_eni_metrics(struct ena_adapter *adapter) { device_t dev; struct ena_admin_eni_stats *eni_metrics; struct sysctl_ctx_list *ctx; struct sysctl_oid *tree; struct sysctl_oid_list *child; struct sysctl_oid *eni_node; struct sysctl_oid_list *eni_list; dev = adapter->pdev; ctx = device_get_sysctl_ctx(dev); tree = device_get_sysctl_tree(dev); child = SYSCTL_CHILDREN(tree); eni_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "eni_metrics", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "ENA's ENI metrics"); eni_list = SYSCTL_CHILDREN(eni_node); eni_metrics = &adapter->eni_metrics; SYSCTL_ADD_U64(ctx, eni_list, OID_AUTO, "bw_in_allowance_exceeded", CTLFLAG_RD, &eni_metrics->bw_in_allowance_exceeded, 0, "Inbound BW allowance exceeded"); SYSCTL_ADD_U64(ctx, eni_list, OID_AUTO, "bw_out_allowance_exceeded", CTLFLAG_RD, &eni_metrics->bw_out_allowance_exceeded, 0, "Outbound BW allowance exceeded"); SYSCTL_ADD_U64(ctx, eni_list, OID_AUTO, "pps_allowance_exceeded", CTLFLAG_RD, &eni_metrics->pps_allowance_exceeded, 0, "PPS allowance exceeded"); SYSCTL_ADD_U64(ctx, eni_list, OID_AUTO, "conntrack_allowance_exceeded", CTLFLAG_RD, &eni_metrics->conntrack_allowance_exceeded, 0, "Connection tracking allowance exceeded"); SYSCTL_ADD_U64(ctx, eni_list, OID_AUTO, "linklocal_allowance_exceeded", CTLFLAG_RD, &eni_metrics->linklocal_allowance_exceeded, 0, "Linklocal packet rate allowance exceeded"); } static void ena_sysctl_add_tuneables(struct ena_adapter *adapter) { device_t dev; struct sysctl_ctx_list *ctx; struct sysctl_oid *tree; struct sysctl_oid_list *child; dev = adapter->pdev; ctx = device_get_sysctl_ctx(dev); tree = device_get_sysctl_tree(dev); child = SYSCTL_CHILDREN(tree); /* Tuneable number of buffers in the buf-ring (drbr) */ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "buf_ring_size", CTLTYPE_U32 | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0, ena_sysctl_buf_ring_size, "I", "Size of the Tx buffer ring (drbr)."); /* Tuneable number of the Rx ring size */ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_queue_size", CTLTYPE_U32 | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0, ena_sysctl_rx_queue_size, "I", "Size of the Rx ring. The size should be a power of 2."); /* Tuneable number of IO queues */ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "io_queues_nb", CTLTYPE_U32 | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0, ena_sysctl_io_queues_nb, "I", "Number of IO queues."); /* * Tuneable, which determines how often ENA metrics will be read. * 0 means it's turned off. Maximum allowed value is limited by: * ENA_METRICS_MAX_SAMPLE_INTERVAL. */ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "stats_sample_interval", CTLTYPE_U16 | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0, ena_sysctl_metrics_interval, "SU", "Interval in seconds for updating Netword interface metrics. 0 turns off the update."); } /* Kernel option RSS prevents manipulation of key hash and indirection table. */ #ifndef RSS static void ena_sysctl_add_rss(struct ena_adapter *adapter) { device_t dev; struct sysctl_ctx_list *ctx; struct sysctl_oid *tree; struct sysctl_oid_list *child; dev = adapter->pdev; ctx = device_get_sysctl_ctx(dev); tree = device_get_sysctl_tree(dev); child = SYSCTL_CHILDREN(tree); /* RSS options */ tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rss", CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Receive Side Scaling options."); child = SYSCTL_CHILDREN(tree); /* RSS hash key */ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "key", CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0, ena_sysctl_rss_key, "A", "RSS key."); /* Tuneable RSS indirection table */ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "indir_table", CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0, ena_sysctl_rss_indir_table, "A", "RSS indirection table."); /* RSS indirection table size */ SYSCTL_ADD_INT(ctx, child, OID_AUTO, "indir_table_size", CTLFLAG_RD | CTLFLAG_MPSAFE, &ena_rss_table_size, 0, "RSS indirection table size."); } #endif /* RSS */ static void ena_sysctl_add_irq_affinity(struct ena_adapter *adapter) { device_t dev; struct sysctl_ctx_list *ctx; struct sysctl_oid *tree; struct sysctl_oid_list *child; dev = adapter->pdev; ctx = device_get_sysctl_ctx(dev); tree = device_get_sysctl_tree(dev); child = SYSCTL_CHILDREN(tree); tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "irq_affinity", CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Decide base CPU and stride for irqs affinity."); child = SYSCTL_CHILDREN(tree); /* Add base cpu leaf */ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "base_cpu", CTLTYPE_S32 | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0, ena_sysctl_irq_base_cpu, "I", "Base cpu index for setting irq affinity."); /* Add cpu stride leaf */ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "cpu_stride", CTLTYPE_S32 | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0, ena_sysctl_irq_cpu_stride, "I", "Distance between irqs when setting affinity."); } /* * ena_sysctl_update_queue_node_nb - Register/unregister sysctl queue nodes. * * Whether the nodes are registered or unregistered depends on a delta between * the `old` and `new` parameters, representing the number of queues. * * This function is used to hide sysctl attributes for queue nodes which aren't * currently used by the HW (e.g. after a call to `ena_sysctl_io_queues_nb`). * * NOTE: * All unregistered nodes must be registered again at detach, i.e. by a call to * this function. */ void ena_sysctl_update_queue_node_nb(struct ena_adapter *adapter, int old, int new) { struct sysctl_oid *oid; int min, max, i; min = MIN(old, new); max = MIN(MAX(old, new), adapter->max_num_io_queues); for (i = min; i < max; ++i) { oid = adapter->que[i].oid; sysctl_wlock(); if (old > new) sysctl_unregister_oid(oid); else sysctl_register_oid(oid); sysctl_wunlock(); } } static int ena_sysctl_buf_ring_size(SYSCTL_HANDLER_ARGS) { struct ena_adapter *adapter = arg1; uint32_t val; int error; ENA_LOCK_LOCK(); if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) { error = EINVAL; goto unlock; } val = 0; error = sysctl_wire_old_buffer(req, sizeof(val)); if (error == 0) { val = adapter->buf_ring_size; error = sysctl_handle_32(oidp, &val, 0, req); } if (error != 0 || req->newptr == NULL) goto unlock; if (!powerof2(val) || val == 0) { ena_log(adapter->pdev, ERR, "Requested new Tx buffer ring size (%u) is not a power of 2\n", val); error = EINVAL; goto unlock; } if (val != adapter->buf_ring_size) { ena_log(adapter->pdev, INFO, "Requested new Tx buffer ring size: %d. Old size: %d\n", val, adapter->buf_ring_size); error = ena_update_buf_ring_size(adapter, val); } else { ena_log(adapter->pdev, ERR, "New Tx buffer ring size is the same as already used: %u\n", adapter->buf_ring_size); } unlock: ENA_LOCK_UNLOCK(); return (error); } static int ena_sysctl_rx_queue_size(SYSCTL_HANDLER_ARGS) { struct ena_adapter *adapter = arg1; uint32_t val; int error; ENA_LOCK_LOCK(); if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) { error = EINVAL; goto unlock; } val = 0; error = sysctl_wire_old_buffer(req, sizeof(val)); if (error == 0) { val = adapter->requested_rx_ring_size; error = sysctl_handle_32(oidp, &val, 0, req); } if (error != 0 || req->newptr == NULL) goto unlock; if (val < ENA_MIN_RING_SIZE || val > adapter->max_rx_ring_size) { ena_log(adapter->pdev, ERR, "Requested new Rx queue size (%u) is out of range: [%u, %u]\n", val, ENA_MIN_RING_SIZE, adapter->max_rx_ring_size); error = EINVAL; goto unlock; } /* Check if the parameter is power of 2 */ if (!powerof2(val)) { ena_log(adapter->pdev, ERR, "Requested new Rx queue size (%u) is not a power of 2\n", val); error = EINVAL; goto unlock; } if (val != adapter->requested_rx_ring_size) { ena_log(adapter->pdev, INFO, "Requested new Rx queue size: %u. Old size: %u\n", val, adapter->requested_rx_ring_size); error = ena_update_queue_size(adapter, adapter->requested_tx_ring_size, val); } else { ena_log(adapter->pdev, ERR, "New Rx queue size is the same as already used: %u\n", adapter->requested_rx_ring_size); } unlock: ENA_LOCK_UNLOCK(); return (error); } /* * Change number of effectively used IO queues adapter->num_io_queues */ static int ena_sysctl_io_queues_nb(SYSCTL_HANDLER_ARGS) { struct ena_adapter *adapter = arg1; uint32_t old_num_queues, tmp = 0; int error; ENA_LOCK_LOCK(); if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) { error = EINVAL; goto unlock; } error = sysctl_wire_old_buffer(req, sizeof(tmp)); if (error == 0) { tmp = adapter->num_io_queues; error = sysctl_handle_int(oidp, &tmp, 0, req); } if (error != 0 || req->newptr == NULL) goto unlock; if (tmp == 0) { ena_log(adapter->pdev, ERR, "Requested number of IO queues is zero\n"); error = EINVAL; goto unlock; } /* * The adapter::max_num_io_queues is the HW capability. The system * resources availability may potentially be a tighter limit. Therefore * the relation `adapter::max_num_io_queues >= adapter::msix_vecs` * always holds true, while the `adapter::msix_vecs` is variable across * device reset (`ena_destroy_device()` + `ena_restore_device()`). */ if (tmp > (adapter->msix_vecs - ENA_ADMIN_MSIX_VEC)) { ena_log(adapter->pdev, ERR, "Requested number of IO queues is higher than maximum allowed (%u)\n", adapter->msix_vecs - ENA_ADMIN_MSIX_VEC); error = EINVAL; goto unlock; } if (tmp == adapter->num_io_queues) { ena_log(adapter->pdev, ERR, "Requested number of IO queues is equal to current value " "(%u)\n", adapter->num_io_queues); } else { ena_log(adapter->pdev, INFO, "Requested new number of IO queues: %u, current value: " "%u\n", tmp, adapter->num_io_queues); old_num_queues = adapter->num_io_queues; error = ena_update_io_queue_nb(adapter, tmp); if (error != 0) return (error); ena_sysctl_update_queue_node_nb(adapter, old_num_queues, tmp); } unlock: ENA_LOCK_UNLOCK(); return (error); } static int ena_sysctl_metrics_interval(SYSCTL_HANDLER_ARGS) { struct ena_adapter *adapter = arg1; uint16_t interval; int error; ENA_LOCK_LOCK(); if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) { error = EINVAL; goto unlock; } error = sysctl_wire_old_buffer(req, sizeof(interval)); if (error == 0) { interval = adapter->metrics_sample_interval; error = sysctl_handle_16(oidp, &interval, 0, req); } if (error != 0 || req->newptr == NULL) goto unlock; if (interval > ENA_METRICS_MAX_SAMPLE_INTERVAL) { ena_log(adapter->pdev, ERR, "ENA metrics update interval is out of range - maximum allowed value: %d seconds\n", ENA_METRICS_MAX_SAMPLE_INTERVAL); error = EINVAL; goto unlock; } if (interval == 0) { ena_log(adapter->pdev, INFO, "ENA metrics update is now turned off\n"); bzero(&adapter->eni_metrics, sizeof(adapter->eni_metrics)); } else { ena_log(adapter->pdev, INFO, "ENA metrics update interval is set to: %" PRIu16 " seconds\n", interval); } adapter->metrics_sample_interval = interval; unlock: ENA_LOCK_UNLOCK(); return (0); } static int ena_sysctl_irq_base_cpu(SYSCTL_HANDLER_ARGS) { struct ena_adapter *adapter = arg1; int irq_base_cpu = 0; int error; ENA_LOCK_LOCK(); if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) { error = ENODEV; goto unlock; } error = sysctl_wire_old_buffer(req, sizeof(irq_base_cpu)); if (error == 0) { irq_base_cpu = adapter->irq_cpu_base; error = sysctl_handle_int(oidp, &irq_base_cpu, 0, req); } if (error != 0 || req->newptr == NULL) goto unlock; if (irq_base_cpu <= ENA_BASE_CPU_UNSPECIFIED) { ena_log(adapter->pdev, ERR, "Requested base CPU is less than zero.\n"); error = EINVAL; goto unlock; } if (irq_base_cpu > mp_ncpus) { ena_log(adapter->pdev, INFO, "Requested base CPU is larger than the number of available CPUs. \n"); error = EINVAL; goto unlock; } if (irq_base_cpu == adapter->irq_cpu_base) { ena_log(adapter->pdev, INFO, "Requested IRQ base CPU is equal to current value " "(%d)\n", adapter->irq_cpu_base); goto unlock; } ena_log(adapter->pdev, INFO, "Requested new IRQ base CPU: %d, current value: %d\n", irq_base_cpu, adapter->irq_cpu_base); error = ena_update_base_cpu(adapter, irq_base_cpu); unlock: ENA_LOCK_UNLOCK(); return (error); } static int ena_sysctl_irq_cpu_stride(SYSCTL_HANDLER_ARGS) { struct ena_adapter *adapter = arg1; int32_t irq_cpu_stride = 0; int error; ENA_LOCK_LOCK(); if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) { error = ENODEV; goto unlock; } error = sysctl_wire_old_buffer(req, sizeof(irq_cpu_stride)); if (error == 0) { irq_cpu_stride = adapter->irq_cpu_stride; error = sysctl_handle_int(oidp, &irq_cpu_stride, 0, req); } if (error != 0 || req->newptr == NULL) goto unlock; if (irq_cpu_stride < 0) { ena_log(adapter->pdev, ERR, "Requested IRQ stride is less than zero.\n"); error = EINVAL; goto unlock; } if (irq_cpu_stride > mp_ncpus) { ena_log(adapter->pdev, INFO, "Warning: Requested IRQ stride is larger than the number of available CPUs.\n"); } if (irq_cpu_stride == adapter->irq_cpu_stride) { ena_log(adapter->pdev, INFO, "Requested IRQ CPU stride is equal to current value " "(%u)\n", adapter->irq_cpu_stride); goto unlock; } ena_log(adapter->pdev, INFO, "Requested new IRQ CPU stride: %u, current value: %u\n", irq_cpu_stride, adapter->irq_cpu_stride); error = ena_update_cpu_stride(adapter, irq_cpu_stride); if (error != 0) goto unlock; unlock: ENA_LOCK_UNLOCK(); return (error); } #ifndef RSS /* * Change the Receive Side Scaling hash key. */ static int ena_sysctl_rss_key(SYSCTL_HANDLER_ARGS) { struct ena_adapter *adapter = arg1; struct ena_com_dev *ena_dev = adapter->ena_dev; enum ena_admin_hash_functions ena_func; char msg[ENA_HASH_KEY_MSG_SIZE]; char elem[3] = { 0 }; char *endp; u8 rss_key[ENA_HASH_KEY_SIZE]; int error, i; ENA_LOCK_LOCK(); if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) { error = EINVAL; goto unlock; } if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_RSS_ACTIVE, adapter))) { error = ENOTSUP; goto unlock; } error = sysctl_wire_old_buffer(req, sizeof(msg)); if (error != 0) goto unlock; error = ena_com_get_hash_function(adapter->ena_dev, &ena_func); if (error != 0) { device_printf(adapter->pdev, "Cannot get hash function\n"); goto unlock; } if (ena_func != ENA_ADMIN_TOEPLITZ) { error = EINVAL; device_printf(adapter->pdev, "Unsupported hash algorithm\n"); goto unlock; } error = ena_rss_get_hash_key(ena_dev, rss_key); if (error != 0) { device_printf(adapter->pdev, "Cannot get hash key\n"); goto unlock; } for (i = 0; i < ENA_HASH_KEY_SIZE; ++i) snprintf(&msg[i * 2], 3, "%02x", rss_key[i]); error = sysctl_handle_string(oidp, msg, sizeof(msg), req); if (error != 0 || req->newptr == NULL) goto unlock; if (strlen(msg) != sizeof(msg) - 1) { error = EINVAL; device_printf(adapter->pdev, "Invalid key size\n"); goto unlock; } for (i = 0; i < ENA_HASH_KEY_SIZE; ++i) { strncpy(elem, &msg[i * 2], 2); rss_key[i] = strtol(elem, &endp, 16); /* Both hex nibbles in the string must be valid to continue. */ if (endp == elem || *endp != '\0' || rss_key[i] < 0) { error = EINVAL; device_printf(adapter->pdev, "Invalid key hex value: '%c'\n", *endp); goto unlock; } } error = ena_rss_set_hash(ena_dev, rss_key); if (error != 0) device_printf(adapter->pdev, "Cannot fill hash key\n"); unlock: ENA_LOCK_UNLOCK(); return (error); } /* * Change the Receive Side Scaling indirection table. * * The sysctl entry string consists of one or more `x:y` keypairs, where * x stands for the table index and y for its new value. * Table indices that don't need to be updated can be omitted from the string * and will retain their existing values. If an index is entered more than once, * the last value is used. * * Example: * To update two selected indices in the RSS indirection table, e.g. setting * index 0 to queue 5 and then index 5 to queue 0, the below command should be * used: * sysctl dev.ena.0.rss.indir_table="0:5 5:0" */ static int ena_sysctl_rss_indir_table(SYSCTL_HANDLER_ARGS) { int num_queues, error; struct ena_adapter *adapter = arg1; struct ena_indir *indir; char *msg, *buf, *endp; uint32_t idx, value; ENA_LOCK_LOCK(); if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) { error = EINVAL; goto unlock; } if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_RSS_ACTIVE, adapter))) { error = ENOTSUP; goto unlock; } indir = adapter->rss_indir; msg = indir->sysctl_buf; if (unlikely(indir == NULL)) { error = ENOTSUP; goto unlock; } error = sysctl_handle_string(oidp, msg, sizeof(indir->sysctl_buf), req); if (error != 0 || req->newptr == NULL) goto unlock; num_queues = adapter->num_io_queues; /* * This sysctl expects msg to be a list of `x:y` record pairs, * where x is the indirection table index and y is its value. */ for (buf = msg; *buf != '\0'; buf = endp) { idx = strtol(buf, &endp, 10); if (endp == buf || idx < 0) { device_printf(adapter->pdev, "Invalid index: %s\n", buf); error = EINVAL; break; } if (idx >= ENA_RX_RSS_TABLE_SIZE) { device_printf(adapter->pdev, "Index %d out of range\n", idx); error = ERANGE; break; } buf = endp; if (*buf++ != ':') { device_printf(adapter->pdev, "Missing ':' separator\n"); error = EINVAL; break; } value = strtol(buf, &endp, 10); if (endp == buf || value < 0) { device_printf(adapter->pdev, "Invalid value: %s\n", buf); error = EINVAL; break; } if (value >= num_queues) { device_printf(adapter->pdev, "Value %d out of range\n", value); error = ERANGE; break; } indir->table[idx] = value; } if (error != 0) /* Reload indirection table with last good data. */ ena_rss_indir_get(adapter, indir->table); /* At this point msg has been clobbered by sysctl_handle_string. */ ena_rss_copy_indir_buf(msg, indir->table); if (error == 0) error = ena_rss_indir_set(adapter, indir->table); unlock: ENA_LOCK_UNLOCK(); return (error); } #endif /* RSS */