/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2026 Ruslan Bukin
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include
/*
* Boundary values for the page table page index space:
*
* L3 pages: [0, NUL2E)
* L2 pages: [NUL2E, NUL2E + NUL1E)
* L1 pages: [NUL2E + NUL1E, NUL2E + NUL1E + NUL0E)
*
* Note that these ranges are used in both SV39 and SV48 mode. In SV39 mode the
* ranges are not fully populated since there are at most Ln_ENTRIES^2 L3 pages
* in a set of page tables.
*/
/*
* Sizes of the page index sub-ranges described above; each is the previous
* one multiplied by Ln_ENTRIES.
*/
#define NUL0E Ln_ENTRIES
#define NUL1E (Ln_ENTRIES * NUL0E)
#define NUL2E (Ln_ENTRIES * NUL1E)
/*
* Page index of the page table page that covers virtual address v:
* pmap_l1_pindex() yields an index in the L2 page range (offset by NUL2E),
* pmap_l2_pindex() yields an index in the L3 page range.
*/
#define pmap_l1_pindex(v) (NUL2E + ((v) >> L1_SHIFT))
#define pmap_l2_pindex(v) ((v) >> L2_SHIFT)
/*
* Page table entry accessors.  All of them are thin wrappers around the
* 64-bit atomic primitives; "clear" means storing (or swapping in) zero.
*/
#define pmap_clear(pte) pmap_store(pte, 0)
#define pmap_clear_bits(pte, bits) atomic_clear_64(pte, bits)
#define pmap_load_store(pte, entry) atomic_swap_64(pte, entry)
#define pmap_load_clear(pte) pmap_load_store(pte, 0)
#define pmap_load(pte) atomic_load_64(pte)
#define pmap_store(pte, entry) atomic_store_64(pte, entry)
#define pmap_store_bits(pte, bits) atomic_set_64(pte, bits)
/* Index of the entry for va within the page table of the given level. */
#define pmap_l0_index(va) (((va) >> L0_SHIFT) & Ln_ADDR_MASK)
#define pmap_l1_index(va) (((va) >> L1_SHIFT) & Ln_ADDR_MASK)
#define pmap_l2_index(va) (((va) >> L2_SHIFT) & Ln_ADDR_MASK)
#define pmap_l3_index(va) (((va) >> L3_SHIFT) & Ln_ADDR_MASK)
/*
* Convert a page table entry to the physical address it refers to.  The
* bits in PTE_HI_MASK are stripped first; the remaining value is shifted
* down to the relevant PPN field and scaled to the matching alignment
* (PAGE_SIZE, 1 << L2_SHIFT or 1 << L1_SHIFT respectively).
*/
#define PTE_TO_PHYS(pte) \
((((pte) & ~PTE_HI_MASK) >> PTE_PPN0_S) * PAGE_SIZE)
#define L2PTE_TO_PHYS(l2) \
((((l2) & ~PTE_HI_MASK) >> PTE_PPN1_S) << L2_SHIFT)
#define L1PTE_TO_PHYS(l1) \
((((l1) & ~PTE_HI_MASK) >> PTE_PPN2_S) << L1_SHIFT)
/* Convert a page table entry to the vm_page_t of the page it refers to. */
#define PTE_TO_VM_PAGE(pte) PHYS_TO_VM_PAGE(PTE_TO_PHYS(pte))
/********************/
/* Inline functions */
/********************/
/*
 * Return a pointer to the L0 (top-level) entry that covers va.
 *
 * Only meaningful outside SV39 mode; both the mode and the validity of va
 * are asserted under INVARIANTS.
 */
static __inline pd_entry_t *
pmap_l0(struct riscv_iommu_pmap *pmap, vm_offset_t va)
{

	KASSERT(pmap->pm_mode != PMAP_MODE_SV39,
	    ("%s: in SV39 mode", __func__));
	KASSERT(VIRT_IS_VALID(va),
	    ("%s: malformed virtual address %#lx", __func__, va));

	return (pmap->pm_top + pmap_l0_index(va));
}
/*
 * Given a pointer to an L0 entry, return a pointer to the L1 entry for va
 * inside the L1 table that the L0 entry refers to.
 *
 * The L0 entry is not checked for validity here; callers do that first.
 */
static __inline pd_entry_t *
pmap_l0_to_l1(struct riscv_iommu_pmap *pmap, pd_entry_t *l0, vm_offset_t va)
{
	vm_paddr_t l1_pa;
	pd_entry_t *l1_table;

	KASSERT(pmap->pm_mode != PMAP_MODE_SV39,
	    ("%s: in SV39 mode", __func__));

	/* Reach the L1 table through the direct map. */
	l1_pa = PTE_TO_PHYS(pmap_load(l0));
	l1_table = (pd_entry_t *)PHYS_TO_DMAP(l1_pa);

	return (l1_table + pmap_l1_index(va));
}
/*
 * Return a pointer to the L1 entry that covers va, or NULL if it cannot be
 * reached.
 *
 * In SV39 mode the top-level table is indexed directly with the L1 index.
 * Otherwise the L0 entry is consulted first; NULL is returned when that
 * entry is not valid or has any PTE_RX bit set (presumably a leaf mapping
 * rather than a pointer to an L1 table).
 */
static __inline pd_entry_t *
pmap_l1(struct riscv_iommu_pmap *pmap, vm_offset_t va)
{
	pd_entry_t *l0;

	KASSERT(VIRT_IS_VALID(va),
	    ("%s: malformed virtual address %#lx", __func__, va));

	if (pmap->pm_mode == PMAP_MODE_SV39)
		return (pmap->pm_top + pmap_l1_index(va));

	l0 = pmap_l0(pmap, va);
	if ((pmap_load(l0) & PTE_V) == 0 || (pmap_load(l0) & PTE_RX) != 0)
		return (NULL);

	return (pmap_l0_to_l1(pmap, l0, va));
}
/*
 * Given a pointer to an L1 entry, return a pointer to the L2 entry for va
 * inside the L2 table that the L1 entry refers to.
 *
 * The L1 entry is not checked for validity here.
 */
static __inline pd_entry_t *
pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va)
{
	vm_paddr_t l2_pa;
	pd_entry_t *l2_table;

	/* Reach the L2 table through the direct map. */
	l2_pa = PTE_TO_PHYS(pmap_load(l1));
	l2_table = (pd_entry_t *)PHYS_TO_DMAP(l2_pa);

	return (l2_table + pmap_l2_index(va));
}
/*
 * Return a pointer to the L2 entry that covers va, or NULL if it cannot be
 * reached: no L1 entry, an L1 entry that is not valid, or an L1 entry with
 * any PTE_RX bit set (presumably a leaf mapping rather than a pointer to an
 * L2 table).
 */
static __inline pd_entry_t *
pmap_l2(struct riscv_iommu_pmap *pmap, vm_offset_t va)
{
	pd_entry_t *l1;

	l1 = pmap_l1(pmap, va);
	if (l1 == NULL || (pmap_load(l1) & PTE_V) == 0 ||
	    (pmap_load(l1) & PTE_RX) != 0)
		return (NULL);

	return (pmap_l1_to_l2(l1, va));
}
/*
 * Given a pointer to an L2 entry, return a pointer to the L3 entry for va
 * inside the L3 table that the L2 entry refers to.
 *
 * The L2 entry is not checked for validity here; callers do that first.
 */
static __inline pt_entry_t *
pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va)
{
	vm_paddr_t phys;
	pt_entry_t *l3;

	phys = PTE_TO_PHYS(pmap_load(l2));
	/* The L3 table holds pt_entry_t's; cast to the declared type. */
	l3 = (pt_entry_t *)PHYS_TO_DMAP(phys);

	return (&l3[pmap_l3_index(va)]);
}
static __inline pt_entry_t *
pmap_l3(struct riscv_iommu_pmap *pmap, vm_offset_t va)
{
pd_entry_t *l2;
l2 = pmap_l2(pmap, va);
if (l2 == NULL)
return (NULL);
if ((pmap_load(l2) & PTE_V) == 0)
return (NULL);
if ((pmap_load(l2) & PTE_RX) != 0)
return (NULL);
return (pmap_l2_to_l3(l2, va));
}
/*
 * Add count to the pmap's resident count.  The pmap lock must be held.
 * The accounting only exists in INVARIANTS kernels; otherwise this is a
 * no-op.
 */
static __inline void
pmap_resident_count_inc(struct riscv_iommu_pmap *pmap, int count)
{
#ifdef INVARIANTS
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	pmap->sp_resident_count += count;
#endif
}

/*
 * Subtract count from the pmap's resident count, asserting that it does
 * not underflow.  The pmap lock must be held.  The accounting only exists
 * in INVARIANTS kernels; otherwise this is a no-op.
 */
static __inline void
pmap_resident_count_dec(struct riscv_iommu_pmap *pmap, int count)
{
#ifdef INVARIANTS
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(pmap->sp_resident_count >= count,
	    ("pmap %p resident count underflow %ld %d", pmap,
	    pmap->sp_resident_count, count));
	pmap->sp_resident_count -= count;
#endif
}
/***************************************************
* Page table page management routines.....
***************************************************/
/*
 * Initialize an IOMMU pmap for the given translation mode.
 *
 * A wired, zeroed page is allocated (with VM_ALLOC_WAITOK) to serve as the
 * top-level page table.  pm_satp is composed from the mode constant and the
 * page frame number of that table.  Panics on an unknown mode.
 *
 * Always returns 1.
 */
int
iommu_pmap_pinit(struct riscv_iommu_pmap *pmap, enum pmap_mode pm_mode)
{
	vm_paddr_t root_pa;
	vm_page_t root_pg;

	root_pg = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO |
	    VM_ALLOC_WAITOK);
	root_pa = VM_PAGE_TO_PHYS(root_pg);

	pmap->pm_top = (pd_entry_t *)PHYS_TO_DMAP(root_pa);
	pmap->pm_mode = pm_mode;

	if (pm_mode == PMAP_MODE_SV39)
		pmap->pm_satp = SATP_MODE_SV39;
	else if (pm_mode == PMAP_MODE_SV48)
		pmap->pm_satp = SATP_MODE_SV48;
	else
		panic("Unknown virtual memory system");

	/* Add the page frame number of the top-level table. */
	pmap->pm_satp |= (root_pa >> PAGE_SHIFT);

#ifdef INVARIANTS
	pmap->sp_resident_count = 0;
#endif
	mtx_init(&pmap->pm_mtx, "iommu pmap", NULL, MTX_DEF);

	return (1);
}
/*
* Release any resources held by the given physical map.
* Called when a pmap initialized by pmap_pinit is being released.
* Should only be called if the map contains no valid mappings.
*/
void
iommu_pmap_release(struct riscv_iommu_pmap *pmap)
{
vm_page_t m;
KASSERT(pmap->sp_resident_count == 0,
("pmap_release: pmap resident count %ld != 0",
pmap->sp_resident_count));
m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_top));
vm_page_unwire_noq(m);
vm_page_free_zero(m);
mtx_destroy(&pmap->pm_mtx);
}
/*
* Allocate the page table page with page index ptepindex and link it into
* the page table hierarchy.  Called when the desired page table page does
* not exist.  The pmap lock must be held.
*
* ptepindex selects the level of the new page:
*   [0, NUL2E)                            an L3 page,
*   [NUL2E, NUL2E + NUL1E)                an L2 page,
*   [NUL2E + NUL1E, NUL2E + NUL1E + NUL0E) an L1 page (not in SV39 mode).
*
* Missing parent page table pages are allocated by recursion.  When a
* parent already exists, its ref_count is incremented to account for the
* new child.
*
* Returns the new page, or NULL if a page allocation failed.  There is no
* lock pointer argument and no retry loop in this variant: the page is
* requested without VM_ALLOC_WAITOK and failure is reported to the caller
* immediately.
*
* Note: if the allocation of a parent page fails during recursion, the
* page(s) already allocated for the lower level(s) are released again via
* the "fail" path of each recursion level.
*/
static vm_page_t
_pmap_alloc_l3(struct riscv_iommu_pmap *pmap, vm_pindex_t ptepindex)
{
vm_page_t m, pdpg;
pt_entry_t entry;
vm_paddr_t phys;
pn_t pn;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
/*
* Allocate a page table page.
*/
m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO);
if (m == NULL) {
/*
* Nothing has been linked yet, so there is nothing to undo.
* The caller decides whether to retry.
*/
return (NULL);
}
m->pindex = ptepindex;
/*
* Link the new page table page into the hierarchy.  The slot it
* goes into is asserted to be not yet valid.
*/
pn = VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT;
if (ptepindex >= NUL2E + NUL1E) {
/* New L1 page: install it directly in the top-level (L0) table. */
pd_entry_t *l0;
vm_pindex_t l0index;
KASSERT(pmap->pm_mode != PMAP_MODE_SV39,
("%s: pindex %#lx in SV39 mode", __func__, ptepindex));
KASSERT(ptepindex < NUL2E + NUL1E + NUL0E,
("%s: pindex %#lx out of range", __func__, ptepindex));
l0index = ptepindex - (NUL2E + NUL1E);
l0 = &pmap->pm_top[l0index];
KASSERT((pmap_load(l0) & PTE_V) == 0,
("%s: L0 entry %#lx is valid", __func__, pmap_load(l0)));
entry = PTE_V | (pn << PTE_PPN0_S);
pmap_store(l0, entry);
} else if (ptepindex >= NUL2E) {
/* New L2 page: install it in an L1 entry. */
pd_entry_t *l0, *l1;
vm_pindex_t l0index, l1index;
l1index = ptepindex - NUL2E;
if (pmap->pm_mode == PMAP_MODE_SV39) {
/* In SV39 mode the top-level table holds the L1 entries. */
l1 = &pmap->pm_top[l1index];
} else {
l0index = l1index >> Ln_ENTRIES_SHIFT;
l0 = &pmap->pm_top[l0index];
if (pmap_load(l0) == 0) {
/* Recurse to allocate the L1 page. */
if (_pmap_alloc_l3(pmap,
NUL2E + NUL1E + l0index) == NULL)
goto fail;
phys = PTE_TO_PHYS(pmap_load(l0));
} else {
/*
* The L1 page exists: take a reference on it
* for the new L2 page.
*/
phys = PTE_TO_PHYS(pmap_load(l0));
pdpg = PHYS_TO_VM_PAGE(phys);
pdpg->ref_count++;
}
l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);
l1 = &l1[ptepindex & Ln_ADDR_MASK];
}
KASSERT((pmap_load(l1) & PTE_V) == 0,
("%s: L1 entry %#lx is valid", __func__, pmap_load(l1)));
entry = PTE_V | (pn << PTE_PPN0_S);
pmap_store(l1, entry);
} else {
/* New L3 page: install it in an L2 entry. */
vm_pindex_t l0index, l1index;
pd_entry_t *l0, *l1, *l2;
l1index = ptepindex >> (L1_SHIFT - L2_SHIFT);
if (pmap->pm_mode == PMAP_MODE_SV39) {
l1 = &pmap->pm_top[l1index];
if (pmap_load(l1) == 0) {
/* Recurse to allocate the L2 page. */
if (_pmap_alloc_l3(pmap, NUL2E + l1index)
== NULL)
goto fail;
} else {
/*
* The L2 page exists: take a reference on it
* for the new L3 page.
*/
pdpg = PTE_TO_VM_PAGE(pmap_load(l1));
pdpg->ref_count++;
}
} else {
l0index = l1index >> Ln_ENTRIES_SHIFT;
l0 = &pmap->pm_top[l0index];
if (pmap_load(l0) == 0) {
/*
* Recurse to allocate the L2 page; since the L0
* entry is empty, that call in turn recurses to
* allocate the L1 page.
*/
if (_pmap_alloc_l3(pmap, NUL2E + l1index)
== NULL)
goto fail;
phys = PTE_TO_PHYS(pmap_load(l0));
l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);
l1 = &l1[l1index & Ln_ADDR_MASK];
} else {
phys = PTE_TO_PHYS(pmap_load(l0));
l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);
l1 = &l1[l1index & Ln_ADDR_MASK];
if (pmap_load(l1) == 0) {
/* Recurse to allocate the L2 page. */
if (_pmap_alloc_l3(pmap,
NUL2E + l1index) == NULL)
goto fail;
} else {
/*
* The L2 page exists: take a reference
* on it for the new L3 page.
*/
pdpg = PTE_TO_VM_PAGE(pmap_load(l1));
pdpg->ref_count++;
}
}
}
/* At this point the L1 entry refers to a valid L2 page. */
phys = PTE_TO_PHYS(pmap_load(l1));
l2 = (pd_entry_t *)PHYS_TO_DMAP(phys);
l2 = &l2[ptepindex & Ln_ADDR_MASK];
KASSERT((pmap_load(l2) & PTE_V) == 0,
("%s: L2 entry %#lx is valid", __func__, pmap_load(l2)));
entry = PTE_V | (pn << PTE_PPN0_S);
pmap_store(l2, entry);
}
/* Each page table page counts as one resident page. */
pmap_resident_count_inc(pmap, 1);
return (m);
fail:
/* A parent allocation failed: release the page allocated above. */
vm_page_unwire_noq(m);
vm_page_free_zero(m);
return (NULL);
}
/*
 * Remove a single IOMMU entry.
 *
 * Looks up the L3 entry for va under the pmap lock and clears it.
 *
 * Returns KERN_SUCCESS if a valid entry was found and removed, and
 * KERN_FAILURE if there is no L3 table for va or the L3 entry is not
 * valid.  The resident count is only decremented when a valid entry is
 * actually removed, so that removing an address twice (or one that was
 * never mapped) cannot corrupt the accounting.
 *
 * No invalidation is issued here; presumably the caller takes care of
 * that after a successful return.
 */
int
iommu_pmap_remove(struct riscv_iommu_pmap *pmap, vm_offset_t va)
{
	pt_entry_t *l3;
	int rc;

	PMAP_LOCK(pmap);

	l3 = pmap_l3(pmap, va);
	if (l3 != NULL && (pmap_load(l3) & PTE_V) != 0) {
		pmap_resident_count_dec(pmap, 1);
		pmap_clear(l3);
		rc = KERN_SUCCESS;
	} else
		rc = KERN_FAILURE;

	PMAP_UNLOCK(pmap);

	return (rc);
}
/*
 * Add a single IOMMU entry mapping va to pa.
 *
 * The new L3 entry is always valid, readable, accessed and tagged
 * PTE_MA_IO.  PTE_X and PTE_W follow prot; PTE_D is set when either prot
 * or flags contains VM_PROT_WRITE, so that writeable mappings never need
 * dirty-bit accounting; PTE_U is set for va below VM_MAX_USER_ADDRESS.
 *
 * If the L3 table covering va does not exist yet it is allocated, together
 * with any missing parent tables.  A missing table for va at or above
 * VM_MAXUSER_ADDRESS is a panic.
 *
 * Returns KERN_SUCCESS, or KERN_RESOURCE_SHORTAGE if a page table page
 * could not be allocated.
 */
int
iommu_pmap_enter(struct riscv_iommu_pmap *pmap, vm_offset_t va, vm_paddr_t pa,
    vm_prot_t prot, u_int flags)
{
	pd_entry_t *l2p, l2val;
	pt_entry_t *l3p, pte;
	vm_page_t ptpage;
	pn_t pfn;

	/* Compose the new entry before taking the lock. */
	pfn = (pa / PAGE_SIZE);
	pte = (pfn << PTE_PPN0_S) | PTE_V | PTE_R | PTE_A | PTE_MA_IO;
	if ((prot & VM_PROT_EXECUTE) != 0)
		pte |= PTE_X;
	if ((prot & VM_PROT_WRITE) != 0)
		pte |= PTE_W | PTE_D;
	if ((flags & VM_PROT_WRITE) != 0)
		pte |= PTE_D;
	if (va < VM_MAX_USER_ADDRESS)
		pte |= PTE_U;

	CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);

	PMAP_LOCK(pmap);

	/* A missing L2 entry is treated like an empty one. */
	l2p = pmap_l2(pmap, va);
	l2val = (l2p != NULL) ? pmap_load(l2p) : 0;

	if ((l2val & PTE_V) != 0 && (l2val & PTE_RWX) == 0) {
		/* The L2 entry points at an existing L3 table. */
		l3p = pmap_l2_to_l3(l2p, va);
	} else {
		if (va >= VM_MAXUSER_ADDRESS)
			panic("pmap_enter: missing L3 table for kernel va %#lx",
			    va);

		ptpage = _pmap_alloc_l3(pmap, pmap_l2_pindex(va));
		if (ptpage == NULL) {
			CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
			PMAP_UNLOCK(pmap);
			return (KERN_RESOURCE_SHORTAGE);
		}
		l3p = pmap_l3(pmap, va);
	}

	KASSERT((pmap_load(l3p) & PTE_V) == 0, ("l3 is valid"));
	pmap_store(l3p, pte);
	pmap_resident_count_inc(pmap, 1);

	PMAP_UNLOCK(pmap);

	return (KERN_SUCCESS);
}
static void
iommu_pmap_remove_pages_sv48(struct riscv_iommu_pmap *pmap)
{
pd_entry_t l0e, *l1, l1e, *l2, l2e, *l3, l3e;
vm_paddr_t pa0, pa1, pa;
vm_page_t m0, m1, m;
int i, j, k, l;
PMAP_LOCK(pmap);
for (i = 0; i < Ln_ENTRIES; i++) {
l0e = pmap->pm_top[i];
if ((l0e & PTE_V) == 0)
continue;
pa0 = PTE_TO_PHYS(l0e);
m0 = PHYS_TO_VM_PAGE(pa0);
l1 = (pd_entry_t *)PHYS_TO_DMAP(pa0);
for (j = 0; j < Ln_ENTRIES; j++) {
l1e = l1[j];
if ((l1e & PTE_V) == 0)
continue;
pa1 = PTE_TO_PHYS(l1e);
m1 = PHYS_TO_VM_PAGE(pa1);
l2 = (pd_entry_t *)PHYS_TO_DMAP(pa1);
for (k = 0; k < Ln_ENTRIES; k++) {
l2e = l2[k];
if ((l2e & PTE_V) == 0)
continue;
pa = PTE_TO_PHYS(l2e);
m = PHYS_TO_VM_PAGE(pa);
l3 = (pt_entry_t *)PHYS_TO_DMAP(pa);
for (l = 0; l < Ln_ENTRIES; l++) {
l3e = l3[l];
if ((l3e & PTE_V) == 0)
continue;
panic("%s: l3e found (idx %d %d %d %d)",
__func__, i, j, k, l);
}
vm_page_unwire_noq(m1);
vm_page_unwire_noq(m);
pmap_resident_count_dec(pmap, 1);
vm_page_free(m);
pmap_clear(&l2[k]);
}
vm_page_unwire_noq(m0);
pmap_resident_count_dec(pmap, 1);
vm_page_free(m1);
pmap_clear(&l1[j]);
}
pmap_resident_count_dec(pmap, 1);
vm_page_free(m0);
pmap_clear(&pmap->pm_top[i]);
}
KASSERT(pmap->sp_resident_count == 0,
("Invalid resident count %jd", pmap->sp_resident_count));
PMAP_UNLOCK(pmap);
}
static void
iommu_pmap_remove_pages_sv39(struct riscv_iommu_pmap *pmap)
{
pd_entry_t l1e, *l2, l2e, *l3, l3e;
vm_paddr_t pa1, pa;
vm_page_t m1, m;
int j, k, l;
PMAP_LOCK(pmap);
for (j = 0; j < Ln_ENTRIES; j++) {
l1e = pmap->pm_top[j];
if ((l1e & PTE_V) == 0)
continue;
pa1 = PTE_TO_PHYS(l1e);
m1 = PHYS_TO_VM_PAGE(pa1);
l2 = (pd_entry_t *)PHYS_TO_DMAP(pa1);
for (k = 0; k < Ln_ENTRIES; k++) {
l2e = l2[k];
if ((l2e & PTE_V) == 0)
continue;
pa = PTE_TO_PHYS(l2e);
m = PHYS_TO_VM_PAGE(pa);
l3 = (pt_entry_t *)PHYS_TO_DMAP(pa);
for (l = 0; l < Ln_ENTRIES; l++) {
l3e = l3[l];
if ((l3e & PTE_V) == 0)
continue;
panic("%s: l3e found (idx %d %d %d)",
__func__, j, k, l);
}
vm_page_unwire_noq(m1);
vm_page_unwire_noq(m);
pmap_resident_count_dec(pmap, 1);
vm_page_free(m);
pmap_clear(&l2[k]);
}
pmap_resident_count_dec(pmap, 1);
vm_page_free(m1);
pmap_clear(&pmap->pm_top[j]);
}
KASSERT(pmap->sp_resident_count == 0,
("Invalid resident count %jd", pmap->sp_resident_count));
PMAP_UNLOCK(pmap);
}
void
iommu_pmap_remove_pages(struct riscv_iommu_pmap *pmap)
{
switch (pmap->pm_mode) {
case PMAP_MODE_SV39:
iommu_pmap_remove_pages_sv39(pmap);
break;
case PMAP_MODE_SV48:
iommu_pmap_remove_pages_sv48(pmap);
break;
default:
panic("Unknown virtual memory system");
}
}