Re: [RFC PATCH 06/11] iommu/arm-smmu-v3: Scan leaf TTD to sync hardware dirty log

2021-02-07 Thread Keqian Zhu



On 2021/2/5 3:52, Robin Murphy wrote:
> On 2021-01-28 15:17, Keqian Zhu wrote:
>> From: jiangkunkun 
>>
>> During dirty log tracking, the user will try to retrieve the dirty log
>> from the iommu if it supports hardware dirty log. This adds a new interface
[...]

>>   static void arm_lpae_restrict_pgsizes(struct io_pgtable_cfg *cfg)
>>   {
>>   unsigned long granule, page_sizes;
>> @@ -957,6 +1046,7 @@ arm_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg)
>>   .iova_to_phys= arm_lpae_iova_to_phys,
>>   .split_block= arm_lpae_split_block,
>>   .merge_page= arm_lpae_merge_page,
>> +.sync_dirty_log= arm_lpae_sync_dirty_log,
>>   };
>> return data;
>> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
>> index f1261da11ea8..69f268069282 100644
>> --- a/drivers/iommu/iommu.c
>> +++ b/drivers/iommu/iommu.c
>> @@ -2822,6 +2822,47 @@ size_t iommu_merge_page(struct iommu_domain *domain, 
>> unsigned long iova,
>>   }
>>   EXPORT_SYMBOL_GPL(iommu_merge_page);
>>   +int iommu_sync_dirty_log(struct iommu_domain *domain, unsigned long iova,
>> + size_t size, unsigned long *bitmap,
>> + unsigned long base_iova, unsigned long bitmap_pgshift)
>> +{
>> +const struct iommu_ops *ops = domain->ops;
>> +unsigned int min_pagesz;
>> +size_t pgsize;
>> +int ret;
>> +
>> +min_pagesz = 1 << __ffs(domain->pgsize_bitmap);
>> +
>> +if (!IS_ALIGNED(iova | size, min_pagesz)) {
>> +pr_err("unaligned: iova 0x%lx size 0x%zx min_pagesz 0x%x\n",
>> +   iova, size, min_pagesz);
>> +return -EINVAL;
>> +}
>> +
>> +if (!ops || !ops->sync_dirty_log) {
>> +pr_err("don't support sync dirty log\n");
>> +return -ENODEV;
>> +}
>> +
>> +while (size) {
>> +pgsize = iommu_pgsize(domain, iova, size);
>> +
>> +ret = ops->sync_dirty_log(domain, iova, pgsize,
>> +  bitmap, base_iova, bitmap_pgshift);
> 
> Once again, we have a worst-of-both-worlds iteration that doesn't make much 
> sense. iommu_pgsize() essentially tells you the best supported size that an 
> IOVA range *can* be mapped with, but we're iterating a range that's already 
> mapped, so we don't know if it's relevant, and either way it may not bear any 
> relation to the granularity of the bitmap, which is presumably what actually 
> matters.
> 
> Logically, either we should iterate at the bitmap granularity here, and the 
> driver just says whether the given iova chunk contains any dirty pages or 
> not, or we just pass everything through to the driver and let it do the whole 
> job itself. Doing a little bit of both is just an overcomplicated mess.
> 
> I'm skimming patch #7 and pretty much the same comments apply, so I can't be 
> bothered to repeat them there...
> 
> Robin.
Sorry that I missed these comments...

As I clarified in #4, the variable name is misleading: @pgsize is actually the 
largest size that meets the alignment requirement and fits into the pgsize_bitmap.

All iommu interfaces require @size to fit into the pgsize_bitmap to simplify 
their implementation, and the logic here is very similar to "unmap".
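
For reference, a minimal sketch of the computation that @pgsize refers to,
modeled on the kernel's iommu_pgsize() helper (the standalone function below
and its name are illustrative, not the exact in-tree code):

static size_t example_iommu_pgsize(unsigned long pgsize_bitmap,
                                   unsigned long addr_merge, size_t size)
{
        unsigned int pgsize_idx;
        size_t pgsize;

        /* Largest page size that still fits into 'size' */
        pgsize_idx = __fls(size);

        /* Further limited by the alignment of the address */
        if (addr_merge) {
                unsigned int align_pgsize_idx = __ffs(addr_merge);

                pgsize_idx = min(pgsize_idx, align_pgsize_idx);
        }

        /* Build a mask of acceptable page sizes ... */
        pgsize = (1UL << (pgsize_idx + 1)) - 1;

        /* ... and throw away the sizes the hardware doesn't support */
        pgsize &= pgsize_bitmap;

        /* The caller has already checked min_pagesz alignment */
        BUG_ON(!pgsize);

        /* Pick the biggest remaining page size */
        return 1UL << __fls(pgsize);
}

So each iteration hands the driver the largest aligned, hardware-supported
chunk, exactly as the unmap path does.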

Thanks,
Keqian

> 
>> +if (ret)
>> +break;
>> +
>> +pr_debug("dirty_log_sync: iova 0x%lx pagesz 0x%zx\n", iova,
>> + pgsize);
>> +
>> +iova += pgsize;
>> +size -= pgsize;
>> +}
>> +
>> +return ret;
>> +}
>> +EXPORT_SYMBOL_GPL(iommu_sync_dirty_log);
>> +
>>   void iommu_get_resv_regions(struct device *dev, struct list_head *list)
>>   {
>>   const struct iommu_ops *ops = dev->bus->iommu_ops;
>> diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
>> index 754b62a1bbaf..f44551e4a454 100644
>> --- a/include/linux/io-pgtable.h
>> +++ b/include/linux/io-pgtable.h
>> @@ -166,6 +166,10 @@ struct io_pgtable_ops {
>> size_t size);
>>   size_t (*merge_page)(struct io_pgtable_ops *ops, unsigned long iova,
>>phys_addr_t phys, size_t size, int prot);
>> +int (*sync_dirty_log)(struct io_pgtable_ops *ops,
>> +  unsigned long iova, size_t size,
>> +  unsigned long *bitmap, unsigned long base_iova,
>> +  unsigned long bitmap_pgshift);
>>   };
>> /**
>> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
>> index ac2b0b1bce0f..8069c8375e63 100644
>> --- a/include/linux/iommu.h
>> +++ b/include/linux/iommu.h
>> @@ -262,6 +262,10 @@ struct iommu_ops {
>> size_t size);
>>   size_t (*merge_page)(struct iommu_domain *domain, unsigned long iova,
>>phys_addr_t phys, size_t size, int prot);
>> +int (*sync_dirty_log)(struct iommu_domain *domain,
>> +  unsigned long iova, size_t size,
>> +  unsigned long *bitmap, unsigned long base_iova,
>> +  unsigned long bitmap_pgshift);
>> /* Request/Free a list of reserved regions for

Re: [RFC PATCH 10/11] vfio/iommu_type1: Optimize dirty bitmap population based on iommu HWDBM

2021-02-07 Thread Yi Sun
Hi,

On 21-01-28 23:17:41, Keqian Zhu wrote:

[...]

> +static void vfio_dma_dirty_log_start(struct vfio_iommu *iommu,
> +  struct vfio_dma *dma)
> +{
> + struct vfio_domain *d;
> +
> + list_for_each_entry(d, &iommu->domain_list, next) {
> + /* Go through all domain anyway even if we fail */
> + iommu_split_block(d->domain, dma->iova, dma->size);
> + }
> +}

This should be a switch to prepare for dirty log start. Per the Intel
VT-d spec, there is an SLADE field defined in the Scalable-Mode PASID Table
Entry; it enables Accessed/Dirty flags in second-level paging entries.
So a generic iommu interface is better here. For Intel iommu, it
enables SLADE; for ARM, it splits blocks.
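
A rough sketch of the kind of generic hook this suggests (the op name, the
iommu_switch_dirty_log() wrapper and the signature below are hypothetical, not
an existing kernel interface; each vendor driver would do whatever its hardware
needs - enable SLADE for Intel VT-d, split/merge block mappings for ARM SMMU):

/* Hypothetical addition to struct iommu_ops, for illustration only */
int (*switch_dirty_log)(struct iommu_domain *domain, bool enable,
                        unsigned long iova, size_t size, int prot);

/* vfio could then use one helper for both start and stop */
static void vfio_dma_dirty_log_switch(struct vfio_iommu *iommu,
                                      struct vfio_dma *dma, bool enable)
{
        struct vfio_domain *d;

        list_for_each_entry(d, &iommu->domain_list, next) {
                /* Go through all domains anyway even if one fails */
                iommu_switch_dirty_log(d->domain, enable, dma->iova,
                                       dma->size, d->prot | dma->prot);
        }
}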

> +
> +static void vfio_dma_dirty_log_stop(struct vfio_iommu *iommu,
> + struct vfio_dma *dma)
> +{
> + struct vfio_domain *d;
> +
> + list_for_each_entry(d, &iommu->domain_list, next) {
> + /* Go through all domain anyway even if we fail */
> + iommu_merge_page(d->domain, dma->iova, dma->size,
> +  d->prot | dma->prot);
> + }
> +}

Same as above comment, a generic interface is required here.

> +
> +static void vfio_iommu_dirty_log_switch(struct vfio_iommu *iommu, bool start)
> +{
> + struct rb_node *n;
> +
> + /* Split and merge even if all iommu don't support HWDBM now */
> + for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
> + struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> +
> + if (!dma->iommu_mapped)
> + continue;
> +
> + /* Go through all dma range anyway even if we fail */
> + if (start)
> + vfio_dma_dirty_log_start(iommu, dma);
> + else
> + vfio_dma_dirty_log_stop(iommu, dma);
> + }
> +}
> +
>  static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
>   unsigned long arg)
>  {
> @@ -2812,8 +2900,10 @@ static int vfio_iommu_type1_dirty_pages(struct 
> vfio_iommu *iommu,
>   pgsize = 1 << __ffs(iommu->pgsize_bitmap);
>   if (!iommu->dirty_page_tracking) {
>   ret = vfio_dma_bitmap_alloc_all(iommu, pgsize);
> - if (!ret)
> + if (!ret) {
>   iommu->dirty_page_tracking = true;
> + vfio_iommu_dirty_log_switch(iommu, true);
> + }
>   }
>   mutex_unlock(&iommu->lock);
>   return ret;
> @@ -2822,6 +2912,7 @@ static int vfio_iommu_type1_dirty_pages(struct 
> vfio_iommu *iommu,
>   if (iommu->dirty_page_tracking) {
>   iommu->dirty_page_tracking = false;
>   vfio_dma_bitmap_free_all(iommu);
> + vfio_iommu_dirty_log_switch(iommu, false);
>   }
>   mutex_unlock(&iommu->lock);
>   return 0;
> -- 
> 2.19.1


Re: [RFC PATCH 06/11] iommu/arm-smmu-v3: Scan leaf TTD to sync hardware dirty log

2021-02-07 Thread Keqian Zhu
Hi Robin,

On 2021/2/5 3:52, Robin Murphy wrote:
> On 2021-01-28 15:17, Keqian Zhu wrote:
>> From: jiangkunkun 
>>
>> During dirty log tracking, the user will try to retrieve the dirty log
>> from the iommu if it supports hardware dirty log. This adds a new interface
>> named sync_dirty_log in the iommu layer, and arm smmuv3 implements it:
>> it scans the leaf TTDs and treats a TTD as dirty if it is writable (as we
>> only enable HTTU for stage 1, we check that AP[2] is not set).
>>
>> Co-developed-by: Keqian Zhu 
>> Signed-off-by: Kunkun Jiang 
>> ---
>>   drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 27 +++
>>   drivers/iommu/io-pgtable-arm.c  | 90 +
>>   drivers/iommu/iommu.c   | 41 ++
>>   include/linux/io-pgtable.h  |  4 +
>>   include/linux/iommu.h   | 17 
>>   5 files changed, 179 insertions(+)
>>
>> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c 
>> b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>> index 2434519e4bb6..43d0536b429a 100644
>> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>> @@ -2548,6 +2548,32 @@ static size_t arm_smmu_merge_page(struct iommu_domain 
>> *domain, unsigned long iov
>>   return ops->merge_page(ops, iova, paddr, size, prot);
>>   }
>>   +static int arm_smmu_sync_dirty_log(struct iommu_domain *domain,
>> +   unsigned long iova, size_t size,
>> +   unsigned long *bitmap,
>> +   unsigned long base_iova,
>> +   unsigned long bitmap_pgshift)
>> +{
>> +struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
>> +struct arm_smmu_device *smmu = to_smmu_domain(domain)->smmu;
>> +
>> +if (!(smmu->features & ARM_SMMU_FEAT_HTTU_HD)) {
>> +dev_err(smmu->dev, "don't support HTTU_HD and sync dirty log\n");
>> +return -EPERM;
>> +}
>> +
>> +if (!ops || !ops->sync_dirty_log) {
>> +pr_err("don't support sync dirty log\n");
>> +return -ENODEV;
>> +}
>> +
>> +/* To ensure all inflight transactions are completed */
>> +arm_smmu_flush_iotlb_all(domain);
> 
> What about transactions that arrive between the point that this completes, 
> and the point - potentially much later - that we actually access any given 
> PTE during the walk? I don't see what this is supposed to be synchronising 
> against, even if it were just a CMD_SYNC (I especially don't see why we'd 
> want to knock out the TLBs).
The idea is that the pgtable may be updated by HTTU *before* or *after* the 
actual DMA access.

1) For PCI ATS. As the SMMU spec (3.13.6.1 Hardware flag update for ATS & PRI) 
states:

"In addition to the behavior that is described earlier in this section, if 
hardware-management of Dirty state is enabled
and an ATS request for write access (with NW == 0) is made to a page that is 
marked Writable Clean, the SMMU
assumes a write will be made to that page and marks the page as Writable Dirty 
before returning the ATS response
that grants write access. When this happens, the modification to the page data 
by a device is not visible before
the page state is visible as Writable Dirty."

The problem is that guest memory may be dirtied *after* we actually handle it.

2) For inflight DMA. As SMMU spec (3.13.4 HTTU behavior summary) states:

"In addition, the completion of a TLB invalidation operation makes TTD updates 
that were caused by
transactions that are themselves completed by the completion of the TLB 
invalidation visible. Both
broadcast and explicit CMD_TLBI_* invalidations have this property."

The problem is that we must flush all DMA transactions after the guest stops.



The key to solving these problems is to invalidate the related TLB entries.
1) The TLBI flushes inflight DMA translations (it is issued before the scan in 
dirty_log_sync()).
2) If a DMA translation uses the ATC and occurs after we have handled the dirty 
memory, the ATC has already been invalidated, so the translation will re-mark 
the page as dirty (to be handled in dirty_log_clear()).
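
To make the intended ordering concrete, here is the driver-side sequence from
the hunk above again, stripped of the capability checks and annotated with the
reasoning (only the comments are new; the calls are those of the patch):

static int example_smmu_sync_dirty_log(struct iommu_domain *domain,
                                       unsigned long iova, size_t size,
                                       unsigned long *bitmap,
                                       unsigned long base_iova,
                                       unsigned long bitmap_pgshift)
{
        struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;

        /*
         * 1) Invalidate first: per 3.13.4, completion of the TLBI makes the
         *    TTD updates of already-completed transactions visible, so the
         *    walk below cannot miss dirty state written for inflight DMA.
         *    It also drops ATC entries granted under 3.13.6.1.
         */
        arm_smmu_flush_iotlb_all(domain);

        /*
         * 2) Walk the leaf TTDs and fold their dirty state into the bitmap.
         *    A translation fetched *after* the TLBI will mark its page
         *    Writable Dirty again, so it is caught by a later sync or
         *    handled in dirty_log_clear().
         */
        return ops->sync_dirty_log(ops, iova, size, bitmap,
                                   base_iova, bitmap_pgshift);
}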

Thanks,
Keqian

> 
>> +
>> +return ops->sync_dirty_log(ops, iova, size, bitmap,
>> +base_iova, bitmap_pgshift);
>> +}
>> +
>>   static int arm_smmu_of_xlate(struct device *dev, struct of_phandle_args 
>> *args)
>>   {
>>   return iommu_fwspec_add_ids(dev, args->args, 1);
>> @@ -2649,6 +2675,7 @@ static struct iommu_ops arm_smmu_ops = {
>>   .domain_set_attr= arm_smmu_domain_set_attr,
>>   .split_block= arm_smmu_split_block,
>>   .merge_page= arm_smmu_merge_page,
>> +.sync_dirty_log= arm_smmu_sync_dirty_log,
>>   .of_xlate= arm_smmu_of_xlate,
>>   .get_resv_regions= arm_smmu_get_resv_regions,
>>   .put_resv_regions= generic_iommu_put_resv_regions,
>> diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
>> index 17390f258eb1..6cfe1ef3fedd 100644
>> --- a/drivers/iommu/io-pgtable-arm.c
>> +++ b/drivers/iommu/io-pgtable-arm.c
>> @@ -877,6 +877,95 @@ 

Re: [RFC PATCH 05/11] iommu/arm-smmu-v3: Merge a span of page to block descriptor

2021-02-07 Thread Keqian Zhu
Hi Robin,

On 2021/2/5 3:52, Robin Murphy wrote:
> On 2021-01-28 15:17, Keqian Zhu wrote:
>> From: jiangkunkun 
>>
>> When stopping dirty log tracking, we need to recover all block descriptors
>> that were split when dirty log tracking started. This adds a new
>> interface named merge_page in the iommu layer, and arm smmuv3 implements it:
>> it reinstalls block mappings and unmaps the span of page mappings.
>>
>> It's the caller's duty to find contiguous physical memory.
>>
>> While merging pages, other interfaces are not expected to be working,
>> so race conditions do not exist. And we flush all iotlbs after the merge
>> procedure is completed to ease the pressure on the iommu, as we will merge a
>> huge range of page mappings in general.
> 
> Again, I think we need better reasoning than "race conditions don't exist 
> because we don't expect them to exist".
Sure, because they can't. ;-)

> 
>> Co-developed-by: Keqian Zhu 
>> Signed-off-by: Kunkun Jiang 
>> ---
>>   drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 20 ++
>>   drivers/iommu/io-pgtable-arm.c  | 78 +
>>   drivers/iommu/iommu.c   | 75 
>>   include/linux/io-pgtable.h  |  2 +
>>   include/linux/iommu.h   | 10 +++
>>   5 files changed, 185 insertions(+)
>>
>> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c 
>> b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>> index 5469f4fca820..2434519e4bb6 100644
>> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>> @@ -2529,6 +2529,25 @@ static size_t arm_smmu_split_block(struct 
>> iommu_domain *domain,
>>   return ops->split_block(ops, iova, size);
>>   }
[...]

>> +
>> +size_t iommu_merge_page(struct iommu_domain *domain, unsigned long iova,
>> +size_t size, int prot)
>> +{
>> +phys_addr_t phys;
>> +dma_addr_t p, i;
>> +size_t cont_size, merged_size;
>> +size_t merged = 0;
>> +
>> +while (size) {
>> +phys = iommu_iova_to_phys(domain, iova);
>> +cont_size = PAGE_SIZE;
>> +p = phys + cont_size;
>> +i = iova + cont_size;
>> +
>> +while (cont_size < size && p == iommu_iova_to_phys(domain, i)) {
>> +p += PAGE_SIZE;
>> +i += PAGE_SIZE;
>> +cont_size += PAGE_SIZE;
>> +}
>> +
>> +merged_size = __iommu_merge_page(domain, iova, phys, cont_size,
>> +prot);
> 
> This is incredibly silly. The amount of time you'll spend just on walking the 
> tables in all those iova_to_phys() calls is probably significantly more than 
> it would take the low-level pagetable code to do the entire operation for 
> itself. At this level, any knowledge of how mappings are actually constructed 
> is lost once __iommu_map() returns, so we just don't know, and for this 
> operation in particular there seems little point in trying to guess - the 
> driver backend still has to figure out whether something we *think* might me 
> mergeable actually is, so it's better off doing the entire operation in a 
> single pass by itself.
>
> There's especially little point in starting all this work *before* checking 
> that it's even possible...
>
> Robin.

Well, this looks silly indeed. But the iova->phys info is only stored in the 
pgtable, and it seems there is no other way to find contiguous physical 
addresses :-( (actually, vfio_iommu_replay() has similar logic).

We put the procedure for finding contiguous physical addresses in the common 
iommu layer because this logic is common to all types of iommu driver.

If a vendor iommu driver decides that (iova, phys, cont_size) is not 
merge-able, it can make its own decision about how to map the range. This is 
the same as iommu_map(), which provides (iova, paddr, pgsize) to the vendor 
driver, and the vendor driver makes its own decision about how to map it.

Do I understand your idea correctly?
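
For reference, here is a sketch with at least the "check that it's even
possible first" point applied; it reuses the naming from the hunk above
(__iommu_merge_page() and friends), and only the early capability test is new -
a sketch, not a final implementation:

size_t iommu_merge_page(struct iommu_domain *domain, unsigned long iova,
                        size_t size, int prot)
{
        phys_addr_t phys;
        dma_addr_t p, i;
        size_t cont_size, merged_size;
        size_t merged = 0;

        /* Bail out before walking anything if merging can never succeed */
        if (!domain->ops || !domain->ops->merge_page)
                return 0;

        while (size) {
                /* See how far the mapping stays physically contiguous */
                phys = iommu_iova_to_phys(domain, iova);
                cont_size = PAGE_SIZE;
                p = phys + cont_size;
                i = iova + cont_size;

                while (cont_size < size &&
                       p == iommu_iova_to_phys(domain, i)) {
                        p += PAGE_SIZE;
                        i += PAGE_SIZE;
                        cont_size += PAGE_SIZE;
                }

                merged_size = __iommu_merge_page(domain, iova, phys,
                                                 cont_size, prot);
                iova += merged_size;
                size -= merged_size;
                merged += merged_size;

                if (merged_size != cont_size)
                        break;
        }
        iommu_flush_iotlb_all(domain);

        return merged;
}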

Thanks,
Keqian
> 
>> +iova += merged_size;
>> +size -= merged_size;
>> +merged += merged_size;
>> +
>> +if (merged_size != cont_size)
>> +break;
>> +}
>> +iommu_flush_iotlb_all(domain);
>> +
>> +return merged;
>> +}
>> +EXPORT_SYMBOL_GPL(iommu_merge_page);
>> +
>>   void iommu_get_resv_regions(struct device *dev, struct list_head *list)
>>   {
>>   const struct iommu_ops *ops = dev->bus->iommu_ops;
>> diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
>> index b87c6f4ecaa2..754b62a1bbaf 100644
>> --- a/include/linux/io-pgtable.h
>> +++ b/include/linux/io-pgtable.h
>> @@ -164,6 +164,8 @@ struct io_pgtable_ops {
>>   unsigned long iova);
>>   size_t (*split_block)(struct io_pgtable_ops *ops, unsigned long iova,
>> size_t size);
>> +size_t (*merge_page)(struct io_pgtable_ops *ops, unsigned long iova,
>> + phys_addr_t phys, size_t size, int prot);
>>   };
>> /**
>> diff --git a/include/linux/iommu.h b/include/li

Re: [RFC PATCH 10/11] vfio/iommu_type1: Optimize dirty bitmap population based on iommu HWDBM

2021-02-07 Thread Keqian Zhu
Hi Yi,

On 2021/2/7 17:56, Yi Sun wrote:
> Hi,
> 
> On 21-01-28 23:17:41, Keqian Zhu wrote:
> 
> [...]
> 
>> +static void vfio_dma_dirty_log_start(struct vfio_iommu *iommu,
>> + struct vfio_dma *dma)
>> +{
>> +struct vfio_domain *d;
>> +
>> +list_for_each_entry(d, &iommu->domain_list, next) {
>> +/* Go through all domain anyway even if we fail */
>> +iommu_split_block(d->domain, dma->iova, dma->size);
>> +}
>> +}
> 
> This should be a switch to prepare for dirty log start. Per the Intel
> VT-d spec, there is an SLADE field defined in the Scalable-Mode PASID Table
> Entry; it enables Accessed/Dirty flags in second-level paging entries.
> So a generic iommu interface is better here. For Intel iommu, it
> enables SLADE; for ARM, it splits blocks.
Indeed, a generic interface name is better.

The vendor iommu driver performs its own vendor-specific actions to start dirty 
log tracking, and Intel iommu and ARM smmu may differ. Besides, we may add more 
actions to the ARM smmu driver in the future.

One question: though I am not familiar with the Intel iommu, I think it should 
also split block mappings besides enabling SLADE. Right?

Thanks,
Keqian


Re: [RFC PATCH 04/11] iommu/arm-smmu-v3: Split block descriptor to a span of page

2021-02-07 Thread Keqian Zhu
Hi Robin,

On 2021/2/5 3:51, Robin Murphy wrote:
> On 2021-01-28 15:17, Keqian Zhu wrote:
>> From: jiangkunkun 
>>
>> A block descriptor is not a proper granule for dirty log tracking. This
>> adds a new interface named split_block in the iommu layer, and arm smmuv3
>> implements it: it splits a block descriptor into an equivalent span of
>> page descriptors.
>>
>> While splitting blocks, other interfaces are not expected to be working,
>> so race conditions do not exist. And we flush all iotlbs after the split
>> procedure is completed to ease the pressure on the iommu, as we will split a
>> huge range of block mappings in general.
> 
> "Not expected to be" is not the same thing as "can not". Presumably the whole 
> point of dirty log tracking is that it can be run speculatively in the 
> background, so is there any actual guarantee that the guest can't, say, issue 
> a hotplug event that would cause some memory to be released back to the host 
> and unmapped while a scan might be in progress? Saying effectively "there is 
> no race condition as long as you assume there is no race condition" isn't all 
> that reassuring...
Sorry for my inaccurate expression. "Not expected to be" is inappropriate here; 
the actual meaning is "can not".

The only user of these newly added interfaces is vfio_iommu_type1 for now, and 
vfio_iommu_type1 always acquires "iommu->lock" before invoking them.

> 
> That said, it's not very clear why patches #4 and #5 are here at all, given 
> that patches #6 and #7 appear quite happy to handle block entries.
Splitting blocks into pages is very important for dirty page tracking. Page 
mappings can greatly reduce the amount of dirty memory that has to be handled. 
The KVM stage-2 MMU side has the same logic.

Yes, #6 (log_sync) and #7 (log_clear) are designed to be applied to both block 
and page mappings. The "split" operation may fail (e.g. without BBML1/2, or on 
ENOMEM), but we can still track dirty state at block granularity, which is 
still a much better choice than the full-dirty policy.
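
To illustrate why block-granularity tracking still works when the split fails,
here is a small self-contained sketch of how a dirty range (a page or a whole
block) is folded into the user bitmap; bitmap_set() is the regular kernel
helper, while the function name and signature are illustrative:

static void example_mark_dirty(unsigned long *bitmap, unsigned long base_iova,
                               unsigned long bitmap_pgshift,
                               unsigned long iova, size_t size)
{
        unsigned long start = (iova - base_iova) >> bitmap_pgshift;
        unsigned long nbits = size >> bitmap_pgshift;

        /*
         * A dirty 2MB block with a 4KB bitmap granule simply sets 512
         * consecutive bits, i.e. we over-report at block granularity
         * instead of failing.
         */
        bitmap_set(bitmap, start, nbits);
}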

> 
>> Co-developed-by: Keqian Zhu 
>> Signed-off-by: Kunkun Jiang 
>> ---
>>   drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c |  20 
>>   drivers/iommu/io-pgtable-arm.c  | 122 
>>   drivers/iommu/iommu.c   |  40 +++
>>   include/linux/io-pgtable.h  |   2 +
>>   include/linux/iommu.h   |  10 ++
>>   5 files changed, 194 insertions(+)
>>
>> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c 
>> b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>> index 9208881a571c..5469f4fca820 100644
>> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>> @@ -2510,6 +2510,25 @@ static int arm_smmu_domain_set_attr(struct 
>> iommu_domain *domain,
>>   return ret;
>>   }
>>   +static size_t arm_smmu_split_block(struct iommu_domain *domain,
>> +   unsigned long iova, size_t size)
>> +{
>> +struct arm_smmu_device *smmu = to_smmu_domain(domain)->smmu;
>> +struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
>> +
>> +if (!(smmu->features & (ARM_SMMU_FEAT_BBML1 | ARM_SMMU_FEAT_BBML2))) {
>> +dev_err(smmu->dev, "don't support BBML1/2 and split block\n");
>> +return 0;
>> +}
>> +
>> +if (!ops || !ops->split_block) {
>> +pr_err("don't support split block\n");
>> +return 0;
>> +}
>> +
>> +return ops->split_block(ops, iova, size);
>> +}
>> +
>>   static int arm_smmu_of_xlate(struct device *dev, struct of_phandle_args 
>> *args)
>>   {
>>   return iommu_fwspec_add_ids(dev, args->args, 1);
>> @@ -2609,6 +2628,7 @@ static struct iommu_ops arm_smmu_ops = {
>>   .device_group= arm_smmu_device_group,
>>   .domain_get_attr= arm_smmu_domain_get_attr,
>>   .domain_set_attr= arm_smmu_domain_set_attr,
>> +.split_block= arm_smmu_split_block,
>>   .of_xlate= arm_smmu_of_xlate,
>>   .get_resv_regions= arm_smmu_get_resv_regions,
>>   .put_resv_regions= generic_iommu_put_resv_regions,
>> diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
>> index e299a44808ae..f3b7f7115e38 100644
>> --- a/drivers/iommu/io-pgtable-arm.c
>> +++ b/drivers/iommu/io-pgtable-arm.c
>> @@ -79,6 +79,8 @@
>>   #define ARM_LPAE_PTE_SH_IS(((arm_lpae_iopte)3) << 8)
>>   #define ARM_LPAE_PTE_NS(((arm_lpae_iopte)1) << 5)
>>   #define ARM_LPAE_PTE_VALID(((arm_lpae_iopte)1) << 0)
>> +/* Block descriptor bits */
>> +#define ARM_LPAE_PTE_NT(((arm_lpae_iopte)1) << 16)
>> #define ARM_LPAE_PTE_ATTR_LO_MASK(((arm_lpae_iopte)0x3ff) << 2)
>>   /* Ignore the contiguous bit for block splitting */
>> @@ -679,6 +681,125 @@ static phys_addr_t arm_lpae_iova_to_phys(struct 
>> io_pgtable_ops *ops,
>>   return iopte_to_paddr(pte, data) | iova;
>>   }
>>   +static size_t __arm_lpae_split_block(struct arm_lpae_io_pgtable *data,
>> +