On Mon, Mar 16, 2026 at 5:59 AM Jiri Pirko <[email protected]> wrote:
>
> From: Jiri Pirko <[email protected]>
>
> Add a new "system_cc_decrypted" dma-buf heap to allow userspace to
> allocate decrypted (shared) memory for confidential computing (CoCo)
> VMs.
>
> On CoCo VMs, guest memory is encrypted by default. The hardware uses an
> encryption bit in page table entries (C-bit on AMD SEV, "shared" bit on
> Intel TDX) to control whether a given memory access is encrypted or
> decrypted. The kernel's direct map is set up with encryption enabled,
> so pages returned by alloc_pages() are encrypted in the direct map
> by default. To make this memory usable for devices that do not support
> DMA to encrypted memory (no TDISP support), it has to be explicitly
> decrypted. A couple of things are needed to properly handle
> decrypted memory for the dma-buf use case:
>
> - set_memory_decrypted() on the direct map after allocation:
> Besides clearing the encryption bit in the direct map PTEs, this
> also notifies the hypervisor about the page state change. On free,
> the inverse set_memory_encrypted() must be called before returning
> pages to the allocator. If re-encryption fails, pages
> are intentionally leaked to prevent decrypted memory from being
> reused as private.
>
> - pgprot_decrypted() for userspace and kernel virtual mappings:
> Any new mapping of the decrypted pages, be it to userspace via
> mmap or to kernel vmalloc space via vmap, creates PTEs independent
> of the direct map. These must also have the encryption bit cleared,
> otherwise accesses through them would see encrypted (garbage) data.
>
> - DMA_ATTR_CC_DECRYPTED for DMA mapping:
> Since the pages are already decrypted, the DMA API needs to be
> informed via DMA_ATTR_CC_DECRYPTED so it can map them correctly
> as unencrypted for device access.
>
> On non-CoCo VMs, the system_cc_decrypted heap is not registered
> to prevent misuse by userspace that does not understand
> the security implications of explicitly decrypted memory.
>
> Signed-off-by: Jiri Pirko <[email protected]>
> ---
> v2->v3:
> - removed couple of leftovers from headers
> v1->v2:
> - fixed build errors on s390 by including mem_encrypt.h
> - converted system heap flag implementation to a separate heap
> ---
> drivers/dma-buf/heaps/system_heap.c | 103 ++++++++++++++++++++++++++--
> 1 file changed, 98 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/dma-buf/heaps/system_heap.c
> b/drivers/dma-buf/heaps/system_heap.c
> index b3650d8fd651..a525e9aaaffa 100644
> --- a/drivers/dma-buf/heaps/system_heap.c
> +++ b/drivers/dma-buf/heaps/system_heap.c
> @@ -10,17 +10,25 @@
> * Andrew F. Davis <[email protected]>
> */
>
> +#include <linux/cc_platform.h>
> #include <linux/dma-buf.h>
> #include <linux/dma-mapping.h>
> #include <linux/dma-heap.h>
> #include <linux/err.h>
> #include <linux/highmem.h>
> +#include <linux/mem_encrypt.h>
> #include <linux/mm.h>
> +#include <linux/set_memory.h>
> #include <linux/module.h>
> +#include <linux/pgtable.h>
> #include <linux/scatterlist.h>
> #include <linux/slab.h>
> #include <linux/vmalloc.h>
>
> +struct system_heap_priv {
> + bool decrypted;
> +};
Hi Jiri,
I wonder if it'd be better to call this cc_decrypted (or I guess
cc_shared, based on Robin's comment in the previous patch) like the DMA
attr? There's a separate effort for "restricted" heaps with TEE for
(encrypted) video playback, which doesn't involve VMs or RDMA. I think
the cc_ prefix might help avoid any confusion between the use case here
and restricted heaps.
> +
> struct system_heap_buffer {
> struct dma_heap *heap;
> struct list_head attachments;
> @@ -29,6 +37,7 @@ struct system_heap_buffer {
> struct sg_table sg_table;
> int vmap_cnt;
> void *vaddr;
> + bool decrypted;
> };
>
> struct dma_heap_attachment {
> @@ -36,6 +45,7 @@ struct dma_heap_attachment {
> struct sg_table table;
> struct list_head list;
> bool mapped;
> + bool decrypted;
> };
>
> #define LOW_ORDER_GFP (GFP_HIGHUSER | __GFP_ZERO)
> @@ -52,6 +62,34 @@ static gfp_t order_flags[] = {HIGH_ORDER_GFP,
> HIGH_ORDER_GFP, LOW_ORDER_GFP};
> static const unsigned int orders[] = {8, 4, 0};
> #define NUM_ORDERS ARRAY_SIZE(orders)
>
> +static int system_heap_set_page_decrypted(struct page *page)
> +{
> + unsigned long addr = (unsigned long)page_address(page);
> + unsigned int nr_pages = 1 << compound_order(page);
> + int ret;
> +
> + ret = set_memory_decrypted(addr, nr_pages);
> + if (ret)
> + pr_warn_ratelimited("dma-buf system heap: failed to decrypt
> page at %p\n",
> + page_address(page));
> +
> + return ret;
> +}
> +
> +static int system_heap_set_page_encrypted(struct page *page)
> +{
> + unsigned long addr = (unsigned long)page_address(page);
> + unsigned int nr_pages = 1 << compound_order(page);
> + int ret;
> +
> + ret = set_memory_encrypted(addr, nr_pages);
> + if (ret)
> + pr_warn_ratelimited("dma-buf system heap: failed to
> re-encrypt page at %p, leaking memory\n",
> + page_address(page));
> +
> + return ret;
> +}
> +
> static int dup_sg_table(struct sg_table *from, struct sg_table *to)
> {
> struct scatterlist *sg, *new_sg;
> @@ -90,6 +128,7 @@ static int system_heap_attach(struct dma_buf *dmabuf,
> a->dev = attachment->dev;
> INIT_LIST_HEAD(&a->list);
> a->mapped = false;
> + a->decrypted = buffer->decrypted;
>
> attachment->priv = a;
>
> @@ -119,9 +158,11 @@ static struct sg_table *system_heap_map_dma_buf(struct
> dma_buf_attachment *attac
> {
> struct dma_heap_attachment *a = attachment->priv;
> struct sg_table *table = &a->table;
> + unsigned long attrs;
> int ret;
>
> - ret = dma_map_sgtable(attachment->dev, table, direction, 0);
> + attrs = a->decrypted ? DMA_ATTR_CC_DECRYPTED : 0;
> + ret = dma_map_sgtable(attachment->dev, table, direction, attrs);
> if (ret)
> return ERR_PTR(ret);
>
> @@ -188,8 +229,13 @@ static int system_heap_mmap(struct dma_buf *dmabuf,
> struct vm_area_struct *vma)
> unsigned long addr = vma->vm_start;
> unsigned long pgoff = vma->vm_pgoff;
> struct scatterlist *sg;
> + pgprot_t prot;
> int i, ret;
>
> + prot = vma->vm_page_prot;
> + if (buffer->decrypted)
> + prot = pgprot_decrypted(prot);
> +
> for_each_sgtable_sg(table, sg, i) {
> unsigned long n = sg->length >> PAGE_SHIFT;
>
> @@ -206,8 +252,7 @@ static int system_heap_mmap(struct dma_buf *dmabuf,
> struct vm_area_struct *vma)
> if (addr + size > vma->vm_end)
> size = vma->vm_end - addr;
>
> - ret = remap_pfn_range(vma, addr, page_to_pfn(page),
> - size, vma->vm_page_prot);
> + ret = remap_pfn_range(vma, addr, page_to_pfn(page), size,
> prot);
> if (ret)
> return ret;
>
> @@ -225,6 +270,7 @@ static void *system_heap_do_vmap(struct
> system_heap_buffer *buffer)
> struct page **pages = vmalloc(sizeof(struct page *) * npages);
> struct page **tmp = pages;
> struct sg_page_iter piter;
> + pgprot_t prot;
> void *vaddr;
>
> if (!pages)
> @@ -235,7 +281,10 @@ static void *system_heap_do_vmap(struct
> system_heap_buffer *buffer)
> *tmp++ = sg_page_iter_page(&piter);
> }
>
> - vaddr = vmap(pages, npages, VM_MAP, PAGE_KERNEL);
> + prot = PAGE_KERNEL;
> + if (buffer->decrypted)
> + prot = pgprot_decrypted(prot);
> + vaddr = vmap(pages, npages, VM_MAP, prot);
> vfree(pages);
>
> if (!vaddr)
> @@ -296,6 +345,14 @@ static void system_heap_dma_buf_release(struct dma_buf
> *dmabuf)
> for_each_sgtable_sg(table, sg, i) {
> struct page *page = sg_page(sg);
>
> + /*
> + * Intentionally leak pages that cannot be re-encrypted
> + * to prevent decrypted memory from being reused.
> + */
> + if (buffer->decrypted &&
> + system_heap_set_page_encrypted(page))
> + continue;
> +
> __free_pages(page, compound_order(page));
> }
> sg_free_table(table);
> @@ -347,6 +404,8 @@ static struct dma_buf *system_heap_allocate(struct
> dma_heap *heap,
> DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
> unsigned long size_remaining = len;
> unsigned int max_order = orders[0];
> + struct system_heap_priv *priv = dma_heap_get_drvdata(heap);
> + bool decrypted = priv->decrypted;
> struct dma_buf *dmabuf;
> struct sg_table *table;
> struct scatterlist *sg;
> @@ -362,6 +421,7 @@ static struct dma_buf *system_heap_allocate(struct
> dma_heap *heap,
> mutex_init(&buffer->lock);
> buffer->heap = heap;
> buffer->len = len;
> + buffer->decrypted = decrypted;
>
> INIT_LIST_HEAD(&pages);
> i = 0;
> @@ -396,6 +456,14 @@ static struct dma_buf *system_heap_allocate(struct
> dma_heap *heap,
> list_del(&page->lru);
> }
>
> + if (decrypted) {
> + for_each_sgtable_sg(table, sg, i) {
> + ret = system_heap_set_page_decrypted(sg_page(sg));
> + if (ret)
> + goto free_pages;
> + }
> + }
> +
> /* create the dmabuf */
> exp_info.exp_name = dma_heap_get_name(heap);
> exp_info.ops = &system_heap_buf_ops;
> @@ -413,6 +481,13 @@ static struct dma_buf *system_heap_allocate(struct
> dma_heap *heap,
> for_each_sgtable_sg(table, sg, i) {
> struct page *p = sg_page(sg);
>
> + /*
> + * Intentionally leak pages that cannot be re-encrypted
> + * to prevent decrypted memory from being reused.
> + */
> + if (buffer->decrypted &&
> + system_heap_set_page_encrypted(p))
> + continue;
> __free_pages(p, compound_order(p));
> }
> sg_free_table(table);
> @@ -428,6 +503,14 @@ static const struct dma_heap_ops system_heap_ops = {
> .allocate = system_heap_allocate,
> };
>
> +static struct system_heap_priv system_heap_priv = {
> + .decrypted = false,
> +};
> +
> +static struct system_heap_priv system_heap_cc_decrypted_priv = {
> + .decrypted = true,
> +};
> +
> static int __init system_heap_create(void)
> {
> struct dma_heap_export_info exp_info;
> @@ -435,8 +518,18 @@ static int __init system_heap_create(void)
>
> exp_info.name = "system";
> exp_info.ops = &system_heap_ops;
> - exp_info.priv = NULL;
> + exp_info.priv = &system_heap_priv;
> +
> + sys_heap = dma_heap_add(&exp_info);
> + if (IS_ERR(sys_heap))
> + return PTR_ERR(sys_heap);
> +
> + if (IS_ENABLED(CONFIG_HIGHMEM) ||
> + !cc_platform_has(CC_ATTR_MEM_ENCRYPT))
> + return 0;
>
> + exp_info.name = "system_cc_decrypted";
> + exp_info.priv = &system_heap_cc_decrypted_priv;
> sys_heap = dma_heap_add(&exp_info);
> if (IS_ERR(sys_heap))
> return PTR_ERR(sys_heap);
> --
> 2.51.1
>