On Wed, Sep 03, 2025 at 02:46:28PM -0300, Jason Gunthorpe wrote:
> +/**
> + * pt_entry_oa_lg2sz() - Return the size of a OA entry
Nit: "a OA" -> "an OA"
> + * @pts: Entry to query
> + *
> + * If the entry is not contiguous this returns pt_table_item_lg2sz(), otherwise
> + * it returns the total VA/OA size of the entire contiguous entry.
> + */
> +static inline unsigned int pt_entry_oa_lg2sz(const struct pt_state *pts)
> +{
> + return pt_entry_num_contig_lg2(pts) + pt_table_item_lg2sz(pts);
> +}
------
> + * level
> + * The number of table hops from the lowest leaf. Level 0
> + * is always a table of only leaves of the least significant VA bits. The
Hmm, I am a bit confused here. I thought a "leaf" was meant to be a
"leaf" table? But "a table of only leaves" here makes it sound like
a "leaf" table entry?
Also, aren't "the least significant VA bits" the page offset?
> + * table
> + * A linear array of entries representing the translation items for that
> + * level.
> + * index
> + * The position in a table of an element: item = table[index]
> + * item
> + * A single position in a table
> + * entry
> + * A single logical element in a table. If contiguous pages are not
> + * supported then item and entry are the same thing, otherwise entry refers
> + * to the all the items that comprise a single contiguous translation.
So, an "entry" is a group of "items" if contiguous pages (huge
page?) are supported. Then, the "entry" sounds like a physical
(v.s. "logical") table entry, e.g. a PTE that we usually say?
> +#if !IS_ENABLED(CONFIG_GENERIC_ATOMIC64)
> +static inline bool pt_table_install64(struct pt_state *pts, u64 table_entry)
> +{
> + u64 *entryp = pt_cur_table(pts, u64) + pts->index;
> + u64 old_entry = pts->entry;
> + bool ret;
> +
> + /*
> + * Ensure the zero'd table content itself is visible before its PTE can
> + * be. release is a NOP on !SMP, but the HW is still doing an acquire.
> + */
> + if (!IS_ENABLED(CONFIG_SMP))
> + dma_wmb();
Mind elaborating why SMP doesn't need this?
> +/*
> + * PT_WARN_ON is used for invariants that the kunit should be checking can't
> + * happen.
> + */
> +#if IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)
> +#define PT_WARN_ON WARN_ON
> +#else
> +static inline bool PT_WARN_ON(bool condition)
> +{
> + return false;
Should it "return condition"?
Otherwise, these validations wouldn't be effective?
drivers/iommu/generic_pt/pt_iter.h:388: if (PT_WARN_ON(!pts->table_lower))
drivers/iommu/generic_pt/pt_iter.h-389- return -EINVAL;
--
drivers/iommu/generic_pt/pt_iter.h-429-
drivers/iommu/generic_pt/pt_iter.h:430: if (PT_WARN_ON(!pt_can_have_table(pts)) ||
drivers/iommu/generic_pt/pt_iter.h:431: PT_WARN_ON(!pts->table_lower))
drivers/iommu/generic_pt/pt_iter.h-432- return -EINVAL;
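Something like this (untested) would keep those checks effective when
the debug config is off:

static inline bool PT_WARN_ON(bool condition)
{
	/* No warning splat without CONFIG_DEBUG_GENERIC_PT, but still report the failure */
	return condition;
}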
> +/**
> + * pt_load_entry() - Read from the location pts points at into the pts
> + * @pts: Table index to load
> + *
> + * Set the type of entry that was loaded. pts->entry and pts->table_lower
> + * will be filled in with the entry's content.
> + */
> +static inline void pt_load_entry(struct pt_state *pts)
> +{
> + pts->type = pt_load_entry_raw(pts);
> + if (pts->type == PT_ENTRY_TABLE)
> + pts->table_lower = pt_table_ptr(pts);
> +}
I see a couple of callers checking pts->type right after this. Maybe it
could return pts->type, matching pt_load_entry_raw()?
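E.g. something like this (untested; guessing the enum name from what
pt_load_entry_raw() returns):

static inline enum pt_entry_type pt_load_entry(struct pt_state *pts)
{
	pts->type = pt_load_entry_raw(pts);
	if (pts->type == PT_ENTRY_TABLE)
		pts->table_lower = pt_table_ptr(pts);
	return pts->type;
}

Then callers could do "if (pt_load_entry(pts) == PT_ENTRY_TABLE)" directly.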
> diff --git a/drivers/iommu/generic_pt/pt_fmt_defaults.h b/drivers/iommu/generic_pt/pt_fmt_defaults.h
> new file mode 100644
> index 00000000000000..19e8f820c1dccf
> --- /dev/null
> +++ b/drivers/iommu/generic_pt/pt_fmt_defaults.h
> @@ -0,0 +1,193 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
> + *
> + * Default definitions for formats that don't define these functions.
> + */
> +#ifndef __GENERIC_PT_PT_FMT_DEFAULTS_H
> +#define __GENERIC_PT_PT_FMT_DEFAULTS_H
> +
> +#include "pt_defs.h"
> +#include <linux/log2.h>
Nit: the <linux/log2.h> include should go before "pt_defs.h", i.e. <>
includes precede "" includes.
> +#ifndef pt_pgsz_lg2_to_level
> +static inline unsigned int pt_pgsz_lg2_to_level(struct pt_common *common,
> + unsigned int pgsize_lg2)
> +{
> + return (pgsize_lg2 - PT_GRANULE_LG2SZ) /
> + (PT_TABLEMEM_LG2SZ - ilog2(PT_ITEM_WORD_SIZE));
> + return 0;
"return 0" should likely be dropped.
> +/*
> + * Format supplies either:
> + * pt_entry_oa - OA is at the start of a contiguous entry
> + * or
> + * pt_item_oa - OA is correct for every item in a contiguous entry
What does "correct" mean here?
> +/**
> + * pt_range_to_end_index() - Ending index iteration
> + * @pts: Iteration State
> + *
> + * Return: the last index for the iteration in pts.
> + */
> +static inline unsigned int pt_range_to_end_index(const struct pt_state *pts)
> +{
> + unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
> + struct pt_range *range = pts->range;
> + unsigned int num_entries_lg2;
> +
> + if (range->va == range->last_va)
> + return pts->index + 1;
> +
> + if (pts->range->top_level == pts->level)
> + return log2_div(fvalog2_mod(pts->range->last_va,
> + pts->range->max_vasz_lg2),
> + isz_lg2) +
> + 1;
How about:
return 1 + log2_div(...);
?
> +static __always_inline struct pt_range _pt_top_range(struct pt_common *common,
> + uintptr_t top_of_table)
> +{
> + struct pt_range range = {
> + .common = common,
> + .top_table =
> + (struct pt_table_p *)(top_of_table &
> + ~(uintptr_t)PT_TOP_LEVEL_MASK),
> + .top_level = top_of_table % (1 << PT_TOP_LEVEL_BITS),
Since top_of_table is unsigned, would it be faster to do this bitwise:
.top_level = top_of_table & PT_TOP_LEVEL_MASK,
?
> +/*
Kernel-doc comments should start with "/**".
> + * pt_walk_descend_all() - Recursively invoke the walker for a table item
> + * @parent_pts: Iteration State
> + * @fn: Walker function to call
> + * @arg: Value to pass to the function
> + *
> + * With pts pointing at a table item this will descend and over the entire lower
> + * table. This creates a new walk and does not alter pts or pts->range.
> + */
> +static __always_inline int
> +pt_walk_descend_all(const struct pt_state *parent_pts, pt_level_fn_t fn,
> + void *arg)
-------
> +/**
> + * pt_compute_best_pgsize() - Determine the best page size for leaf entries
> + * @pgsz_bitmap: Permitted page sizes
> + * @va: Starting virtual address for the leaf entry
> + * @last_va: Last virtual address for the leaf entry, sets the max page size
> + * @oa: Starting output address for the leaf entry
> + *
> + * Compute the largest page size for va, last_va, and oa together and return it
> + * in lg2. The largest page size depends on the format's supported page sizes at
> + * this level, and the relative alignment of the VA and OA addresses. 0 means
> + * the OA cannot be stored with the provided pgsz_bitmap.
> + */
> +static inline unsigned int pt_compute_best_pgsize(pt_vaddr_t pgsz_bitmap,
> + pt_vaddr_t va,
> + pt_vaddr_t last_va,
> + pt_oaddr_t oa)
> +{
> + unsigned int best_pgsz_lg2;
> + unsigned int pgsz_lg2;
> + pt_vaddr_t len = last_va - va + 1;
> + pt_vaddr_t mask;
> +
> + if (PT_WARN_ON(va >= last_va))
> + return 0;
> +
> + /*
> + * Given a VA/OA pair the best page size is the largest page side
> + * where:
> + *
> + * 1) VA and OA start at the page. Bitwise this is the count of least
> + * significant 0 bits.
> + * This also implies that last_va/oa has the same prefix as va/oa.
> + */
> + mask = va | oa;
> +
> + /*
> + * 2) The page size is not larger than the last_va (length). Since page
> + * sizes are always power of two this can't be larger than the
> + * largest power of two factor of the length.
> + */
> + mask |= log2_to_int(log2_fls(len) - 1);
> +
> + best_pgsz_lg2 = log2_ffs(mask);
> +
> + /* Choose the higest bit <= best_pgsz_lg2 */
Typo: s/higest/highest/
> +/* Compute a */
> +#define log2_to_int_t(type, a_lg2) ((type)(((type)1) << (a_lg2)))
> +static_assert(log2_to_int_t(unsigned int, 0) == 1);
> +
> +/* Compute a - 1 (aka all low bits set) */
> +#define log2_to_max_int_t(type, a_lg2) ((type)(log2_to_int_t(type, a_lg2) - 1))
> +
> +/* Compute a / b */
> +#define log2_div_t(type, a, b_lg2) ((type)(((type)a) >> (b_lg2)))
> +static_assert(log2_div_t(unsigned int, 4, 2) == 1);
I skimmed through these macros and their callers. They mostly deal
with VAs, OAs, and masks, which feels like straightforward
bit-masking/shifting operations.
E.g.
log2_to_int_t = 64bit ? BIT_ULL(lg2) : BIT(lg2);
log2_to_max_int_t = 64bit ? GENMASK_ULL(lg2 - 1, 0) : GENMASK(lg2 - 1, 0);
What's the benefit of expressing them as arithmetic ones?
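Not a strong preference, but one rough (untested) way the first two
could be spelled with the existing helpers, assuming "type" is always
an unsigned 32/64-bit type:

#define log2_to_int_t(type, a_lg2) ((type)BIT_ULL(a_lg2))

/* Compute a - 1 (all low bits set); BIT_ULL keeps the math in 64-bit */
#define log2_to_max_int_t(type, a_lg2) ((type)(BIT_ULL(a_lg2) - 1))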
> diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h
> new file mode 100644
> index 00000000000000..a29bdd7b244de6
> --- /dev/null
> +++ b/include/linux/generic_pt/common.h
> @@ -0,0 +1,134 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
> + */
> +#ifndef __GENERIC_PT_COMMON_H
> +#define __GENERIC_PT_COMMON_H
> +
> +#include <linux/types.h>
> +#include <linux/build_bug.h>
> +#include <linux/bits.h>
Nit: keep these includes in alphabetical order?
Thanks
Nicolin