/* * Copyright (C) 2016-2018 Vincenzo Maffione * Copyright (C) 2015 Stefano Garzarella * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * common headers */ #if defined(__FreeBSD__) #include #include #include #include #include #include #include #include #include #define usleep_range(_1, _2) \ pause_sbt("sync-kloop-sleep", SBT_1US * _1, SBT_1US * 1, C_ABSOLUTE) #elif defined(linux) #include #include #include #endif #include #include #include #include /* Support for eventfd-based notifications. */ #if defined(linux) #define SYNC_KLOOP_POLL #endif /* Write kring pointers (hwcur, hwtail) to the CSB. * This routine is coupled with ptnetmap_guest_read_kring_csb(). */ static inline void sync_kloop_kernel_write(struct nm_csb_ktoa __user *ptr, uint32_t hwcur, uint32_t hwtail) { /* Issue a first store-store barrier to make sure writes to the * netmap ring do not overcome updates on ktoa->hwcur and ktoa->hwtail. */ nm_stst_barrier(); /* * The same scheme used in nm_sync_kloop_appl_write() applies here. * We allow the application to read a value of hwcur more recent than the value * of hwtail, since this would anyway result in a consistent view of the * ring state (and hwcur can never wraparound hwtail, since hwcur must be * behind head). * * The following memory barrier scheme is used to make this happen: * * Application Kernel * * STORE(hwcur) LOAD(hwtail) * wmb() <-------------> rmb() * STORE(hwtail) LOAD(hwcur) */ CSB_WRITE(ptr, hwcur, hwcur); nm_stst_barrier(); CSB_WRITE(ptr, hwtail, hwtail); } /* Read kring pointers (head, cur, sync_flags) from the CSB. * This routine is coupled with ptnetmap_guest_write_kring_csb(). */ static inline void sync_kloop_kernel_read(struct nm_csb_atok __user *ptr, struct netmap_ring *shadow_ring, uint32_t num_slots) { /* * We place a memory barrier to make sure that the update of head never * overtakes the update of cur. * (see explanation in sync_kloop_kernel_write). */ CSB_READ(ptr, head, shadow_ring->head); nm_ldld_barrier(); CSB_READ(ptr, cur, shadow_ring->cur); CSB_READ(ptr, sync_flags, shadow_ring->flags); /* Make sure that loads from atok->head and atok->cur are not delayed * after the loads from the netmap ring. */ nm_ldld_barrier(); } /* Enable or disable application --> kernel kicks. */ static inline void csb_ktoa_kick_enable(struct nm_csb_ktoa __user *csb_ktoa, uint32_t val) { CSB_WRITE(csb_ktoa, kern_need_kick, val); } #ifdef SYNC_KLOOP_POLL /* Are application interrupt enabled or disabled? */ static inline uint32_t csb_atok_intr_enabled(struct nm_csb_atok __user *csb_atok) { uint32_t v; CSB_READ(csb_atok, appl_need_kick, v); return v; } #endif /* SYNC_KLOOP_POLL */ static inline void sync_kloop_kring_dump(const char *title, const struct netmap_kring *kring) { nm_prinf("%s, kring %s, hwcur %d, rhead %d, " "rcur %d, rtail %d, hwtail %d", title, kring->name, kring->nr_hwcur, kring->rhead, kring->rcur, kring->rtail, kring->nr_hwtail); } /* Arguments for netmap_sync_kloop_tx_ring() and * netmap_sync_kloop_rx_ring(). */ struct sync_kloop_ring_args { struct netmap_kring *kring; struct nm_csb_atok *csb_atok; struct nm_csb_ktoa *csb_ktoa; #ifdef SYNC_KLOOP_POLL struct eventfd_ctx *irq_ctx; #endif /* SYNC_KLOOP_POLL */ /* Are we busy waiting rather than using a schedule() loop ? */ bool busy_wait; /* Are we processing in the context of VM exit ? */ bool direct; }; static void netmap_sync_kloop_tx_ring(const struct sync_kloop_ring_args *a) { struct netmap_kring *kring = a->kring; struct nm_csb_atok *csb_atok = a->csb_atok; struct nm_csb_ktoa *csb_ktoa = a->csb_ktoa; struct netmap_ring shadow_ring; /* shadow copy of the netmap_ring */ #ifdef SYNC_KLOOP_POLL bool more_txspace = false; #endif /* SYNC_KLOOP_POLL */ uint32_t num_slots; int batch; if (unlikely(nm_kr_tryget(kring, 1, NULL))) { return; } num_slots = kring->nkr_num_slots; /* Disable application --> kernel notifications. */ if (!a->direct) { csb_ktoa_kick_enable(csb_ktoa, 0); } /* Copy the application kring pointers from the CSB */ sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots); for (;;) { batch = shadow_ring.head - kring->nr_hwcur; if (batch < 0) batch += num_slots; #ifdef PTN_TX_BATCH_LIM if (batch > PTN_TX_BATCH_LIM(num_slots)) { /* If application moves ahead too fast, let's cut the move so * that we don't exceed our batch limit. */ uint32_t head_lim = kring->nr_hwcur + PTN_TX_BATCH_LIM(num_slots); if (head_lim >= num_slots) head_lim -= num_slots; nm_prdis(1, "batch: %d head: %d head_lim: %d", batch, shadow_ring.head, head_lim); shadow_ring.head = head_lim; batch = PTN_TX_BATCH_LIM(num_slots); } #endif /* PTN_TX_BATCH_LIM */ if (nm_kr_txspace(kring) <= (num_slots >> 1)) { shadow_ring.flags |= NAF_FORCE_RECLAIM; } /* Netmap prologue */ shadow_ring.tail = kring->rtail; if (unlikely(nm_txsync_prologue(kring, &shadow_ring) >= num_slots)) { /* Reinit ring and enable notifications. */ netmap_ring_reinit(kring); if (!a->busy_wait) { csb_ktoa_kick_enable(csb_ktoa, 1); } break; } if (unlikely(netmap_debug & NM_DEBUG_TXSYNC)) { sync_kloop_kring_dump("pre txsync", kring); } if (unlikely(kring->nm_sync(kring, shadow_ring.flags))) { if (!a->busy_wait) { /* Re-enable notifications. */ csb_ktoa_kick_enable(csb_ktoa, 1); } nm_prerr("txsync() failed"); break; } /* * Finalize * Copy kernel hwcur and hwtail into the CSB for the application sync(), and * do the nm_sync_finalize. */ sync_kloop_kernel_write(csb_ktoa, kring->nr_hwcur, kring->nr_hwtail); if (kring->rtail != kring->nr_hwtail) { /* Some more room available in the parent adapter. */ kring->rtail = kring->nr_hwtail; #ifdef SYNC_KLOOP_POLL more_txspace = true; #endif /* SYNC_KLOOP_POLL */ } if (unlikely(netmap_debug & NM_DEBUG_TXSYNC)) { sync_kloop_kring_dump("post txsync", kring); } /* Interrupt the application if needed. */ #ifdef SYNC_KLOOP_POLL if (a->irq_ctx && more_txspace && csb_atok_intr_enabled(csb_atok)) { /* We could disable kernel --> application kicks here, * to avoid spurious interrupts. */ eventfd_signal(a->irq_ctx, 1); more_txspace = false; } #endif /* SYNC_KLOOP_POLL */ /* Read CSB to see if there is more work to do. */ sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots); if (shadow_ring.head == kring->rhead) { if (a->busy_wait) { break; } /* * No more packets to transmit. We enable notifications and * go to sleep, waiting for a kick from the application when new * new slots are ready for transmission. */ /* Re-enable notifications. */ csb_ktoa_kick_enable(csb_ktoa, 1); /* Double check, with store-load memory barrier. */ nm_stld_barrier(); sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots); if (shadow_ring.head != kring->rhead) { /* We won the race condition, there are more packets to * transmit. Disable notifications and do another cycle */ csb_ktoa_kick_enable(csb_ktoa, 0); continue; } break; } if (nm_kr_txempty(kring)) { /* No more available TX slots. We stop waiting for a notification * from the backend (netmap_tx_irq). */ nm_prdis(1, "TX ring"); break; } } nm_kr_put(kring); #ifdef SYNC_KLOOP_POLL if (a->irq_ctx && more_txspace && csb_atok_intr_enabled(csb_atok)) { eventfd_signal(a->irq_ctx, 1); } #endif /* SYNC_KLOOP_POLL */ } /* RX cycle without receive any packets */ #define SYNC_LOOP_RX_DRY_CYCLES_MAX 2 static inline int sync_kloop_norxslots(struct netmap_kring *kring, uint32_t g_head) { return (NM_ACCESS_ONCE(kring->nr_hwtail) == nm_prev(g_head, kring->nkr_num_slots - 1)); } static void netmap_sync_kloop_rx_ring(const struct sync_kloop_ring_args *a) { struct netmap_kring *kring = a->kring; struct nm_csb_atok *csb_atok = a->csb_atok; struct nm_csb_ktoa *csb_ktoa = a->csb_ktoa; struct netmap_ring shadow_ring; /* shadow copy of the netmap_ring */ int dry_cycles = 0; #ifdef SYNC_KLOOP_POLL bool some_recvd = false; #endif /* SYNC_KLOOP_POLL */ uint32_t num_slots; if (unlikely(nm_kr_tryget(kring, 1, NULL))) { return; } num_slots = kring->nkr_num_slots; /* Get RX csb_atok and csb_ktoa pointers from the CSB. */ num_slots = kring->nkr_num_slots; /* Disable notifications. */ if (!a->direct) { csb_ktoa_kick_enable(csb_ktoa, 0); } /* Copy the application kring pointers from the CSB */ sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots); for (;;) { uint32_t hwtail; /* Netmap prologue */ shadow_ring.tail = kring->rtail; if (unlikely(nm_rxsync_prologue(kring, &shadow_ring) >= num_slots)) { /* Reinit ring and enable notifications. */ netmap_ring_reinit(kring); if (!a->busy_wait) { csb_ktoa_kick_enable(csb_ktoa, 1); } break; } if (unlikely(netmap_debug & NM_DEBUG_RXSYNC)) { sync_kloop_kring_dump("pre rxsync", kring); } if (unlikely(kring->nm_sync(kring, shadow_ring.flags))) { if (!a->busy_wait) { /* Re-enable notifications. */ csb_ktoa_kick_enable(csb_ktoa, 1); } nm_prerr("rxsync() failed"); break; } /* * Finalize * Copy kernel hwcur and hwtail into the CSB for the application sync() */ hwtail = NM_ACCESS_ONCE(kring->nr_hwtail); sync_kloop_kernel_write(csb_ktoa, kring->nr_hwcur, hwtail); if (kring->rtail != hwtail) { kring->rtail = hwtail; #ifdef SYNC_KLOOP_POLL some_recvd = true; #endif /* SYNC_KLOOP_POLL */ dry_cycles = 0; } else { dry_cycles++; } if (unlikely(netmap_debug & NM_DEBUG_RXSYNC)) { sync_kloop_kring_dump("post rxsync", kring); } #ifdef SYNC_KLOOP_POLL /* Interrupt the application if needed. */ if (a->irq_ctx && some_recvd && csb_atok_intr_enabled(csb_atok)) { /* We could disable kernel --> application kicks here, * to avoid spurious interrupts. */ eventfd_signal(a->irq_ctx, 1); some_recvd = false; } #endif /* SYNC_KLOOP_POLL */ /* Read CSB to see if there is more work to do. */ sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots); if (sync_kloop_norxslots(kring, shadow_ring.head)) { if (a->busy_wait) { break; } /* * No more slots available for reception. We enable notification and * go to sleep, waiting for a kick from the application when new receive * slots are available. */ /* Re-enable notifications. */ csb_ktoa_kick_enable(csb_ktoa, 1); /* Double check, with store-load memory barrier. */ nm_stld_barrier(); sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots); if (!sync_kloop_norxslots(kring, shadow_ring.head)) { /* We won the race condition, more slots are available. Disable * notifications and do another cycle. */ csb_ktoa_kick_enable(csb_ktoa, 0); continue; } break; } hwtail = NM_ACCESS_ONCE(kring->nr_hwtail); if (unlikely(hwtail == kring->rhead || dry_cycles >= SYNC_LOOP_RX_DRY_CYCLES_MAX)) { /* No more packets to be read from the backend. We stop and * wait for a notification from the backend (netmap_rx_irq). */ nm_prdis(1, "nr_hwtail: %d rhead: %d dry_cycles: %d", hwtail, kring->rhead, dry_cycles); break; } } nm_kr_put(kring); #ifdef SYNC_KLOOP_POLL /* Interrupt the application if needed. */ if (a->irq_ctx && some_recvd && csb_atok_intr_enabled(csb_atok)) { eventfd_signal(a->irq_ctx, 1); } #endif /* SYNC_KLOOP_POLL */ } #ifdef SYNC_KLOOP_POLL struct sync_kloop_poll_ctx; struct sync_kloop_poll_entry { /* Support for receiving notifications from * a netmap ring or from the application. */ struct file *filp; wait_queue_t wait; wait_queue_head_t *wqh; /* Support for sending notifications to the application. */ struct eventfd_ctx *irq_ctx; struct file *irq_filp; /* Arguments for the ring processing function. Useful * in case of custom wake-up function. */ struct sync_kloop_ring_args *args; struct sync_kloop_poll_ctx *parent; }; struct sync_kloop_poll_ctx { poll_table wait_table; unsigned int next_entry; int (*next_wake_fun)(wait_queue_t *, unsigned, int, void *); unsigned int num_entries; unsigned int num_tx_rings; unsigned int num_rings; /* First num_tx_rings entries are for the TX kicks. * Then the RX kicks entries follow. The last two * entries are for TX irq, and RX irq. */ struct sync_kloop_poll_entry entries[0]; }; static void sync_kloop_poll_table_queue_proc(struct file *file, wait_queue_head_t *wqh, poll_table *pt) { struct sync_kloop_poll_ctx *poll_ctx = container_of(pt, struct sync_kloop_poll_ctx, wait_table); struct sync_kloop_poll_entry *entry = poll_ctx->entries + poll_ctx->next_entry; BUG_ON(poll_ctx->next_entry >= poll_ctx->num_entries); entry->wqh = wqh; entry->filp = file; /* Use the default wake up function. */ if (poll_ctx->next_wake_fun == NULL) { init_waitqueue_entry(&entry->wait, current); } else { init_waitqueue_func_entry(&entry->wait, poll_ctx->next_wake_fun); } add_wait_queue(wqh, &entry->wait); } static int sync_kloop_tx_kick_wake_fun(wait_queue_t *wait, unsigned mode, int wake_flags, void *key) { struct sync_kloop_poll_entry *entry = container_of(wait, struct sync_kloop_poll_entry, wait); netmap_sync_kloop_tx_ring(entry->args); return 0; } static int sync_kloop_tx_irq_wake_fun(wait_queue_t *wait, unsigned mode, int wake_flags, void *key) { struct sync_kloop_poll_entry *entry = container_of(wait, struct sync_kloop_poll_entry, wait); struct sync_kloop_poll_ctx *poll_ctx = entry->parent; int i; for (i = 0; i < poll_ctx->num_tx_rings; i++) { struct eventfd_ctx *irq_ctx = poll_ctx->entries[i].irq_ctx; if (irq_ctx) { eventfd_signal(irq_ctx, 1); } } return 0; } static int sync_kloop_rx_kick_wake_fun(wait_queue_t *wait, unsigned mode, int wake_flags, void *key) { struct sync_kloop_poll_entry *entry = container_of(wait, struct sync_kloop_poll_entry, wait); netmap_sync_kloop_rx_ring(entry->args); return 0; } static int sync_kloop_rx_irq_wake_fun(wait_queue_t *wait, unsigned mode, int wake_flags, void *key) { struct sync_kloop_poll_entry *entry = container_of(wait, struct sync_kloop_poll_entry, wait); struct sync_kloop_poll_ctx *poll_ctx = entry->parent; int i; for (i = poll_ctx->num_tx_rings; i < poll_ctx->num_rings; i++) { struct eventfd_ctx *irq_ctx = poll_ctx->entries[i].irq_ctx; if (irq_ctx) { eventfd_signal(irq_ctx, 1); } } return 0; } #endif /* SYNC_KLOOP_POLL */ int netmap_sync_kloop(struct netmap_priv_d *priv, struct nmreq_header *hdr) { struct nmreq_sync_kloop_start *req = (struct nmreq_sync_kloop_start *)(uintptr_t)hdr->nr_body; struct nmreq_opt_sync_kloop_eventfds *eventfds_opt = NULL; #ifdef SYNC_KLOOP_POLL struct sync_kloop_poll_ctx *poll_ctx = NULL; #endif /* SYNC_KLOOP_POLL */ int num_rx_rings, num_tx_rings, num_rings; struct sync_kloop_ring_args *args = NULL; uint32_t sleep_us = req->sleep_us; struct nm_csb_atok* csb_atok_base; struct nm_csb_ktoa* csb_ktoa_base; struct netmap_adapter *na; struct nmreq_option *opt; bool na_could_sleep = false; bool busy_wait = true; bool direct_tx = false; bool direct_rx = false; int err = 0; int i; if (sleep_us > 1000000) { /* We do not accept sleeping for more than a second. */ return EINVAL; } if (priv->np_nifp == NULL) { return ENXIO; } mb(); /* make sure following reads are not from cache */ na = priv->np_na; if (!nm_netmap_on(na)) { return ENXIO; } NMG_LOCK(); /* Make sure the application is working in CSB mode. */ if (!priv->np_csb_atok_base || !priv->np_csb_ktoa_base) { NMG_UNLOCK(); nm_prerr("sync-kloop on %s requires " "NETMAP_REQ_OPT_CSB option", na->name); return EINVAL; } csb_atok_base = priv->np_csb_atok_base; csb_ktoa_base = priv->np_csb_ktoa_base; /* Make sure that no kloop is currently running. */ if (priv->np_kloop_state & NM_SYNC_KLOOP_RUNNING) { err = EBUSY; } priv->np_kloop_state |= NM_SYNC_KLOOP_RUNNING; NMG_UNLOCK(); if (err) { return err; } num_rx_rings = priv->np_qlast[NR_RX] - priv->np_qfirst[NR_RX]; num_tx_rings = priv->np_qlast[NR_TX] - priv->np_qfirst[NR_TX]; num_rings = num_tx_rings + num_rx_rings; args = nm_os_malloc(num_rings * sizeof(args[0])); if (!args) { err = ENOMEM; goto out; } /* Prepare the arguments for netmap_sync_kloop_tx_ring() * and netmap_sync_kloop_rx_ring(). */ for (i = 0; i < num_tx_rings; i++) { struct sync_kloop_ring_args *a = args + i; a->kring = NMR(na, NR_TX)[i + priv->np_qfirst[NR_TX]]; a->csb_atok = csb_atok_base + i; a->csb_ktoa = csb_ktoa_base + i; a->busy_wait = busy_wait; a->direct = direct_tx; } for (i = 0; i < num_rx_rings; i++) { struct sync_kloop_ring_args *a = args + num_tx_rings + i; a->kring = NMR(na, NR_RX)[i + priv->np_qfirst[NR_RX]]; a->csb_atok = csb_atok_base + num_tx_rings + i; a->csb_ktoa = csb_ktoa_base + num_tx_rings + i; a->busy_wait = busy_wait; a->direct = direct_rx; } /* Validate notification options. */ opt = nmreq_getoption(hdr, NETMAP_REQ_OPT_SYNC_KLOOP_MODE); if (opt != NULL) { struct nmreq_opt_sync_kloop_mode *mode_opt = (struct nmreq_opt_sync_kloop_mode *)opt; direct_tx = !!(mode_opt->mode & NM_OPT_SYNC_KLOOP_DIRECT_TX); direct_rx = !!(mode_opt->mode & NM_OPT_SYNC_KLOOP_DIRECT_RX); if (mode_opt->mode & ~(NM_OPT_SYNC_KLOOP_DIRECT_TX | NM_OPT_SYNC_KLOOP_DIRECT_RX)) { opt->nro_status = err = EINVAL; goto out; } opt->nro_status = 0; } opt = nmreq_getoption(hdr, NETMAP_REQ_OPT_SYNC_KLOOP_EVENTFDS); if (opt != NULL) { if (opt->nro_size != sizeof(*eventfds_opt) + sizeof(eventfds_opt->eventfds[0]) * num_rings) { /* Option size not consistent with the number of * entries. */ opt->nro_status = err = EINVAL; goto out; } #ifdef SYNC_KLOOP_POLL eventfds_opt = (struct nmreq_opt_sync_kloop_eventfds *)opt; opt->nro_status = 0; /* Check if some ioeventfd entry is not defined, and force sleep * synchronization in that case. */ busy_wait = false; for (i = 0; i < num_rings; i++) { if (eventfds_opt->eventfds[i].ioeventfd < 0) { busy_wait = true; break; } } if (busy_wait && (direct_tx || direct_rx)) { /* For direct processing we need all the * ioeventfds to be valid. */ opt->nro_status = err = EINVAL; goto out; } /* We need 2 poll entries for TX and RX notifications coming * from the netmap adapter, plus one entries per ring for the * notifications coming from the application. */ poll_ctx = nm_os_malloc(sizeof(*poll_ctx) + (num_rings + 2) * sizeof(poll_ctx->entries[0])); init_poll_funcptr(&poll_ctx->wait_table, sync_kloop_poll_table_queue_proc); poll_ctx->num_entries = 2 + num_rings; poll_ctx->num_tx_rings = num_tx_rings; poll_ctx->num_rings = num_rings; poll_ctx->next_entry = 0; poll_ctx->next_wake_fun = NULL; if (direct_tx && (na->na_flags & NAF_BDG_MAYSLEEP)) { /* In direct mode, VALE txsync is called from * wake-up context, where it is not possible * to sleep. */ na->na_flags &= ~NAF_BDG_MAYSLEEP; na_could_sleep = true; } for (i = 0; i < num_rings + 2; i++) { poll_ctx->entries[i].args = args + i; poll_ctx->entries[i].parent = poll_ctx; } /* Poll for notifications coming from the applications through * eventfds. */ for (i = 0; i < num_rings; i++, poll_ctx->next_entry++) { struct eventfd_ctx *irq = NULL; struct file *filp = NULL; unsigned long mask; bool tx_ring = (i < num_tx_rings); if (eventfds_opt->eventfds[i].irqfd >= 0) { filp = eventfd_fget( eventfds_opt->eventfds[i].irqfd); if (IS_ERR(filp)) { err = PTR_ERR(filp); goto out; } irq = eventfd_ctx_fileget(filp); if (IS_ERR(irq)) { err = PTR_ERR(irq); goto out; } } poll_ctx->entries[i].irq_filp = filp; poll_ctx->entries[i].irq_ctx = irq; poll_ctx->entries[i].args->busy_wait = busy_wait; /* Don't let netmap_sync_kloop_*x_ring() use * IRQs in direct mode. */ poll_ctx->entries[i].args->irq_ctx = ((tx_ring && direct_tx) || (!tx_ring && direct_rx)) ? NULL : poll_ctx->entries[i].irq_ctx; poll_ctx->entries[i].args->direct = (tx_ring ? direct_tx : direct_rx); if (!busy_wait) { filp = eventfd_fget( eventfds_opt->eventfds[i].ioeventfd); if (IS_ERR(filp)) { err = PTR_ERR(filp); goto out; } if (tx_ring && direct_tx) { /* Override the wake up function * so that it can directly call * netmap_sync_kloop_tx_ring(). */ poll_ctx->next_wake_fun = sync_kloop_tx_kick_wake_fun; } else if (!tx_ring && direct_rx) { /* Same for direct RX. */ poll_ctx->next_wake_fun = sync_kloop_rx_kick_wake_fun; } else { poll_ctx->next_wake_fun = NULL; } mask = filp->f_op->poll(filp, &poll_ctx->wait_table); if (mask & POLLERR) { err = EINVAL; goto out; } } } /* Poll for notifications coming from the netmap rings bound to * this file descriptor. */ if (!busy_wait) { NMG_LOCK(); /* In direct mode, override the wake up function so * that it can forward the netmap_tx_irq() to the * guest. */ poll_ctx->next_wake_fun = direct_tx ? sync_kloop_tx_irq_wake_fun : NULL; poll_wait(priv->np_filp, priv->np_si[NR_TX], &poll_ctx->wait_table); poll_ctx->next_entry++; poll_ctx->next_wake_fun = direct_rx ? sync_kloop_rx_irq_wake_fun : NULL; poll_wait(priv->np_filp, priv->np_si[NR_RX], &poll_ctx->wait_table); poll_ctx->next_entry++; NMG_UNLOCK(); } #else /* SYNC_KLOOP_POLL */ opt->nro_status = EOPNOTSUPP; goto out; #endif /* SYNC_KLOOP_POLL */ } nm_prinf("kloop busy_wait %u, direct_tx %u, direct_rx %u, " "na_could_sleep %u", busy_wait, direct_tx, direct_rx, na_could_sleep); /* Main loop. */ for (;;) { if (unlikely(NM_ACCESS_ONCE(priv->np_kloop_state) & NM_SYNC_KLOOP_STOPPING)) { break; } #ifdef SYNC_KLOOP_POLL if (!busy_wait) { /* It is important to set the task state as * interruptible before processing any TX/RX ring, * so that if a notification on ring Y comes after * we have processed ring Y, but before we call * schedule(), we don't miss it. This is true because * the wake up function will change the task state, * and therefore the schedule_timeout() call below * will observe the change). */ set_current_state(TASK_INTERRUPTIBLE); } #endif /* SYNC_KLOOP_POLL */ /* Process all the TX rings bound to this file descriptor. */ for (i = 0; !direct_tx && i < num_tx_rings; i++) { struct sync_kloop_ring_args *a = args + i; netmap_sync_kloop_tx_ring(a); } /* Process all the RX rings bound to this file descriptor. */ for (i = 0; !direct_rx && i < num_rx_rings; i++) { struct sync_kloop_ring_args *a = args + num_tx_rings + i; netmap_sync_kloop_rx_ring(a); } if (busy_wait) { /* Default synchronization method: sleep for a while. */ usleep_range(sleep_us, sleep_us); } #ifdef SYNC_KLOOP_POLL else { /* Yield to the scheduler waiting for a notification * to come either from netmap or the application. */ schedule_timeout(msecs_to_jiffies(3000)); } #endif /* SYNC_KLOOP_POLL */ } out: #ifdef SYNC_KLOOP_POLL if (poll_ctx) { /* Stop polling from netmap and the eventfds, and deallocate * the poll context. */ if (!busy_wait) { __set_current_state(TASK_RUNNING); } for (i = 0; i < poll_ctx->next_entry; i++) { struct sync_kloop_poll_entry *entry = poll_ctx->entries + i; if (entry->wqh) remove_wait_queue(entry->wqh, &entry->wait); /* We did not get a reference to the eventfds, but * don't do that on netmap file descriptors (since * a reference was not taken. */ if (entry->filp && entry->filp != priv->np_filp) fput(entry->filp); if (entry->irq_ctx) eventfd_ctx_put(entry->irq_ctx); if (entry->irq_filp) fput(entry->irq_filp); } nm_os_free(poll_ctx); poll_ctx = NULL; } #endif /* SYNC_KLOOP_POLL */ if (args) { nm_os_free(args); args = NULL; } /* Reset the kloop state. */ NMG_LOCK(); priv->np_kloop_state = 0; if (na_could_sleep) { na->na_flags |= NAF_BDG_MAYSLEEP; } NMG_UNLOCK(); return err; } int netmap_sync_kloop_stop(struct netmap_priv_d *priv) { struct netmap_adapter *na; bool running = true; int err = 0; if (priv->np_nifp == NULL) { return ENXIO; } mb(); /* make sure following reads are not from cache */ na = priv->np_na; if (!nm_netmap_on(na)) { return ENXIO; } /* Set the kloop stopping flag. */ NMG_LOCK(); priv->np_kloop_state |= NM_SYNC_KLOOP_STOPPING; NMG_UNLOCK(); /* Send a notification to the kloop, in case it is blocked in * schedule_timeout(). We can use either RX or TX, because the * kloop is waiting on both. */ nm_os_selwakeup(priv->np_si[NR_RX]); /* Wait for the kloop to actually terminate. */ while (running) { usleep_range(1000, 1500); NMG_LOCK(); running = (NM_ACCESS_ONCE(priv->np_kloop_state) & NM_SYNC_KLOOP_RUNNING); NMG_UNLOCK(); } return err; } #ifdef WITH_PTNETMAP /* * Guest ptnetmap txsync()/rxsync() routines, used in ptnet device drivers. * These routines are reused across the different operating systems supported * by netmap. */ /* * Reconcile host and guest views of the transmit ring. * * Guest user wants to transmit packets up to the one before ring->head, * and guest kernel knows tx_ring->hwcur is the first packet unsent * by the host kernel. * * We push out as many packets as possible, and possibly * reclaim buffers from previously completed transmission. * * Notifications from the host are enabled only if the user guest would * block (no space in the ring). */ bool netmap_pt_guest_txsync(struct nm_csb_atok *atok, struct nm_csb_ktoa *ktoa, struct netmap_kring *kring, int flags) { bool notify = false; /* Disable notifications */ atok->appl_need_kick = 0; /* * First part: tell the host to process the new packets, * updating the CSB. */ kring->nr_hwcur = ktoa->hwcur; nm_sync_kloop_appl_write(atok, kring->rcur, kring->rhead); /* Ask for a kick from a guest to the host if needed. */ if (((kring->rhead != kring->nr_hwcur || nm_kr_wouldblock(kring)) && NM_ACCESS_ONCE(ktoa->kern_need_kick)) || (flags & NAF_FORCE_RECLAIM)) { atok->sync_flags = flags; notify = true; } /* * Second part: reclaim buffers for completed transmissions. */ if (nm_kr_wouldblock(kring) || (flags & NAF_FORCE_RECLAIM)) { nm_sync_kloop_appl_read(ktoa, &kring->nr_hwtail, &kring->nr_hwcur); } /* * No more room in the ring for new transmissions. The user thread will * go to sleep and we need to be notified by the host when more free * space is available. */ if (nm_kr_wouldblock(kring) && !(kring->nr_kflags & NKR_NOINTR)) { /* Re-enable notifications. */ atok->appl_need_kick = 1; /* Double check, with store-load memory barrier. */ nm_stld_barrier(); nm_sync_kloop_appl_read(ktoa, &kring->nr_hwtail, &kring->nr_hwcur); /* If there is new free space, disable notifications */ if (unlikely(!nm_kr_wouldblock(kring))) { atok->appl_need_kick = 0; } } nm_prdis(1, "%s CSB(head:%u cur:%u hwtail:%u) KRING(head:%u cur:%u tail:%u)", kring->name, atok->head, atok->cur, ktoa->hwtail, kring->rhead, kring->rcur, kring->nr_hwtail); return notify; } /* * Reconcile host and guest view of the receive ring. * * Update hwcur/hwtail from host (reading from CSB). * * If guest user has released buffers up to the one before ring->head, we * also give them to the host. * * Notifications from the host are enabled only if the user guest would * block (no more completed slots in the ring). */ bool netmap_pt_guest_rxsync(struct nm_csb_atok *atok, struct nm_csb_ktoa *ktoa, struct netmap_kring *kring, int flags) { bool notify = false; /* Disable notifications */ atok->appl_need_kick = 0; /* * First part: import newly received packets, by updating the kring * hwtail to the hwtail known from the host (read from the CSB). * This also updates the kring hwcur. */ nm_sync_kloop_appl_read(ktoa, &kring->nr_hwtail, &kring->nr_hwcur); kring->nr_kflags &= ~NKR_PENDINTR; /* * Second part: tell the host about the slots that guest user has * released, by updating cur and head in the CSB. */ if (kring->rhead != kring->nr_hwcur) { nm_sync_kloop_appl_write(atok, kring->rcur, kring->rhead); } /* * No more completed RX slots. The user thread will go to sleep and * we need to be notified by the host when more RX slots have been * completed. */ if (nm_kr_wouldblock(kring) && !(kring->nr_kflags & NKR_NOINTR)) { /* Re-enable notifications. */ atok->appl_need_kick = 1; /* Double check, with store-load memory barrier. */ nm_stld_barrier(); nm_sync_kloop_appl_read(ktoa, &kring->nr_hwtail, &kring->nr_hwcur); /* If there are new slots, disable notifications. */ if (!nm_kr_wouldblock(kring)) { atok->appl_need_kick = 0; } } /* Ask for a kick from the guest to the host if needed. */ if ((kring->rhead != kring->nr_hwcur || nm_kr_wouldblock(kring)) && NM_ACCESS_ONCE(ktoa->kern_need_kick)) { atok->sync_flags = flags; notify = true; } nm_prdis(1, "%s CSB(head:%u cur:%u hwtail:%u) KRING(head:%u cur:%u tail:%u)", kring->name, atok->head, atok->cur, ktoa->hwtail, kring->rhead, kring->rcur, kring->nr_hwtail); return notify; } /* * Callbacks for ptnet drivers: nm_krings_create, nm_krings_delete, nm_dtor. */ int ptnet_nm_krings_create(struct netmap_adapter *na) { struct netmap_pt_guest_adapter *ptna = (struct netmap_pt_guest_adapter *)na; /* Upcast. */ struct netmap_adapter *na_nm = &ptna->hwup.up; struct netmap_adapter *na_dr = &ptna->dr.up; int ret; if (ptna->backend_users) { return 0; } /* Create krings on the public netmap adapter. */ ret = netmap_hw_krings_create(na_nm); if (ret) { return ret; } /* Copy krings into the netmap adapter private to the driver. */ na_dr->tx_rings = na_nm->tx_rings; na_dr->rx_rings = na_nm->rx_rings; return 0; } void ptnet_nm_krings_delete(struct netmap_adapter *na) { struct netmap_pt_guest_adapter *ptna = (struct netmap_pt_guest_adapter *)na; /* Upcast. */ struct netmap_adapter *na_nm = &ptna->hwup.up; struct netmap_adapter *na_dr = &ptna->dr.up; if (ptna->backend_users) { return; } na_dr->tx_rings = NULL; na_dr->rx_rings = NULL; netmap_hw_krings_delete(na_nm); } void ptnet_nm_dtor(struct netmap_adapter *na) { struct netmap_pt_guest_adapter *ptna = (struct netmap_pt_guest_adapter *)na; netmap_mem_put(ptna->dr.up.nm_mem); memset(&ptna->dr, 0, sizeof(ptna->dr)); netmap_mem_pt_guest_ifp_del(na->nm_mem, na->ifp); } int netmap_pt_guest_attach(struct netmap_adapter *arg, unsigned int nifp_offset, unsigned int memid) { struct netmap_pt_guest_adapter *ptna; if_t ifp = arg ? arg->ifp : NULL; int error; /* get allocator */ arg->nm_mem = netmap_mem_pt_guest_new(ifp, nifp_offset, memid); if (arg->nm_mem == NULL) return ENOMEM; arg->na_flags |= NAF_MEM_OWNER; error = netmap_attach_ext(arg, sizeof(struct netmap_pt_guest_adapter), 1); if (error) return error; /* get the netmap_pt_guest_adapter */ ptna = (struct netmap_pt_guest_adapter *) NA(ifp); /* Initialize a separate pass-through netmap adapter that is going to * be used by the ptnet driver only, and so never exposed to netmap * applications. We only need a subset of the available fields. */ memset(&ptna->dr, 0, sizeof(ptna->dr)); ptna->dr.up.ifp = ifp; ptna->dr.up.nm_mem = netmap_mem_get(ptna->hwup.up.nm_mem); ptna->dr.up.nm_config = ptna->hwup.up.nm_config; ptna->backend_users = 0; return 0; } #endif /* WITH_PTNETMAP */