Re: [RFC PATCH 04/11] iommu/arm-smmu-v3: Split block descriptor to a span of page

2021-02-07 Thread Keqian Zhu
Hi Robin,

On 2021/2/5 3:51, Robin Murphy wrote:
> On 2021-01-28 15:17, Keqian Zhu wrote:
>> From: jiangkunkun 
>>
>> A block descriptor is not a proper granule for dirty log tracking. This
>> adds a new interface named split_block to the iommu layer, and arm smmuv3
>> implements it by splitting a block descriptor into an equivalent span of
>> page descriptors.
>>
>> While splitting blocks, other interfaces are not expected to be working,
>> so no race condition exists. We flush all IOTLBs after the split procedure
>> completes to ease the pressure on the IOMMU, since we will generally split
>> a huge range of block mappings.
> 
> "Not expected to be" is not the same thing as "can not". Presumably the whole 
> point of dirty log tracking is that it can be run speculatively in the 
> background, so is there any actual guarantee that the guest can't, say, issue 
> a hotplug event that would cause some memory to be released back to the host 
> and unmapped while a scan might be in progress? Saying effectively "there is 
> no race condition as long as you assume there is no race condition" isn't all 
> that reassuring...
Sorry for my inaccurate expression. "Not expected to be" is inappropriate 
here; the actual meaning is "can not".

The only user of these newly added interfaces is vfio_iommu_type1 for now, 
and vfio_iommu_type1 always acquires "iommu->lock" before invoking them, so 
a split cannot overlap with a concurrent map or unmap on the same container.
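
For example, the calling pattern we rely on looks roughly like this (a
simplified sketch of the vfio_iommu_type1 side with a hypothetical helper
name, assuming the iommu_split_block() wrapper this series adds in iommu.c;
it is not the exact code):

static void vfio_dma_split_blocks(struct vfio_iommu *iommu,
				  struct iommu_domain *domain,
				  struct vfio_dma *dma)
{
	/* The same lock serializes every map/unmap path in this driver */
	mutex_lock(&iommu->lock);
	iommu_split_block(domain, dma->iova, dma->size);
	mutex_unlock(&iommu->lock);
}

So a concurrent unmap through the same container simply waits until the
split has finished.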

> 
> That said, it's not very clear why patches #4 and #5 are here at all, given 
> that patches #6 and #7 appear quite happy to handle block entries.
Splitting blocks into pages is very important for dirty page tracking: once 
a region is mapped by page descriptors, a single dirty descriptor covers one 
page instead of a whole block, which greatly reduces the amount of memory 
that has to be handled as dirty. The KVM stage-2 MMU side has the same logic.
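
To make the benefit concrete, here is a tiny stand-alone illustration (plain
userspace C, pure arithmetic rather than kernel code) of what splitting buys
us with a 4K granule:

#include <stdio.h>

int main(void)
{
	const unsigned long granule = 4096;	/* 4K page */
	const unsigned long blk_l2 = 2UL << 20;	/* 2M lvl-2 block */
	const unsigned long blk_l1 = 1UL << 30;	/* 1G lvl-1 block */

	/* Each block splits into tablesz / sizeof(pte) page descriptors */
	printf("2M block -> %lu page descriptors\n", blk_l2 / granule);
	printf("1G block -> %lu lvl-2 blocks, split recursively\n",
	       blk_l1 / blk_l2);
	/* After the split, one write dirties 4K instead of 2M */
	printf("dirty unit: %lu bytes instead of %lu\n", granule, blk_l2);
	return 0;
}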

Yes, #6 (log_sync) and #7 (log_clear) are designed to handle both block and 
page mappings. The "split" operation may fail (e.g., without BBML1/2 support, 
or on ENOMEM), but we can still track dirty state at block granularity, which 
is still a much better choice than the full-dirty policy.
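
So the intended policy is best-effort, roughly like this (a sketch with a
hypothetical helper name, assuming the wrapper returns the amount actually
split; it is not the literal code of patches #6/#7):

static void start_dirty_log(struct iommu_domain *domain,
			    unsigned long iova, size_t size)
{
	/*
	 * Best effort: without BBML1/2, or if a table allocation fails,
	 * split_block covers less than the requested size and the
	 * remaining block entries stay intact.
	 */
	if (iommu_split_block(domain, iova, size) != size)
		pr_warn("block split incomplete, tracking at block granule\n");

	/* log_sync/log_clear then handle both block and page entries */
}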

> 
>> Co-developed-by: Keqian Zhu 
>> Signed-off-by: Kunkun Jiang 
>> ---
>>   drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c |  20 
>>   drivers/iommu/io-pgtable-arm.c  | 122 
>>   drivers/iommu/iommu.c   |  40 +++
>>   include/linux/io-pgtable.h  |   2 +
>>   include/linux/iommu.h   |  10 ++
>>   5 files changed, 194 insertions(+)
>>
>> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c 
>> b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>> index 9208881a571c..5469f4fca820 100644
>> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>> @@ -2510,6 +2510,25 @@ static int arm_smmu_domain_set_attr(struct iommu_domain *domain,
>>   	return ret;
>>   }
>>   
>> +static size_t arm_smmu_split_block(struct iommu_domain *domain,
>> +				   unsigned long iova, size_t size)
>> +{
>> +	struct arm_smmu_device *smmu = to_smmu_domain(domain)->smmu;
>> +	struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
>> +
>> +	if (!(smmu->features & (ARM_SMMU_FEAT_BBML1 | ARM_SMMU_FEAT_BBML2))) {
>> +		dev_err(smmu->dev, "don't support BBML1/2 and split block\n");
>> +		return 0;
>> +	}
>> +
>> +	if (!ops || !ops->split_block) {
>> +		pr_err("don't support split block\n");
>> +		return 0;
>> +	}
>> +
>> +	return ops->split_block(ops, iova, size);
>> +}
>> +
>> +
>>   static int arm_smmu_of_xlate(struct device *dev, struct of_phandle_args 
>> *args)
>>   {
>>   return iommu_fwspec_add_ids(dev, args->args, 1);
>> @@ -2609,6 +2628,7 @@ static struct iommu_ops arm_smmu_ops = {
>>   	.device_group		= arm_smmu_device_group,
>>   	.domain_get_attr	= arm_smmu_domain_get_attr,
>>   	.domain_set_attr	= arm_smmu_domain_set_attr,
>> +	.split_block		= arm_smmu_split_block,
>>   	.of_xlate		= arm_smmu_of_xlate,
>>   	.get_resv_regions	= arm_smmu_get_resv_regions,
>>   	.put_resv_regions	= generic_iommu_put_resv_regions,
>> diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
>> index e299a44808ae..f3b7f7115e38 100644
>> --- a/drivers/iommu/io-pgtable-arm.c
>> +++ b/drivers/iommu/io-pgtable-arm.c
>> @@ -79,6 +79,8 @@
>>   #define ARM_LPAE_PTE_SH_IS		(((arm_lpae_iopte)3) << 8)
>>   #define ARM_LPAE_PTE_NS		(((arm_lpae_iopte)1) << 5)
>>   #define ARM_LPAE_PTE_VALID		(((arm_lpae_iopte)1) << 0)
>> +/* Block descriptor bits */
>> +#define ARM_LPAE_PTE_NT		(((arm_lpae_iopte)1) << 16)
>>   
>>   #define ARM_LPAE_PTE_ATTR_LO_MASK	(((arm_lpae_iopte)0x3ff) << 2)
>>   
>>   /* Ignore the contiguous bit for block splitting */
>> @@ -679,6 +681,125 @@ static phys_addr_t arm_lpae_iova_to_phys(struct io_pgtable_ops *ops,
>>   	return iopte_to_paddr(pte, data) | iova;
>>   }
>>   
>> +static size_t __arm_lpae_split_block(struct arm_lpae_io_pgtable *data,
>> +   

Re: [RFC PATCH 04/11] iommu/arm-smmu-v3: Split block descriptor to a span of page

2021-02-04 Thread Robin Murphy

On 2021-01-28 15:17, Keqian Zhu wrote:

From: jiangkunkun 

A block descriptor is not a proper granule for dirty log tracking. This
adds a new interface named split_block to the iommu layer, and arm smmuv3
implements it by splitting a block descriptor into an equivalent span of
page descriptors.

While splitting blocks, other interfaces are not expected to be working,
so no race condition exists. We flush all IOTLBs after the split procedure
completes to ease the pressure on the IOMMU, since we will generally split
a huge range of block mappings.


"Not expected to be" is not the same thing as "can not". Presumably the 
whole point of dirty log tracking is that it can be run speculatively in 
the background, so is there any actual guarantee that the guest can't, 
say, issue a hotplug event that would cause some memory to be released 
back to the host and unmapped while a scan might be in progress? Saying 
effectively "there is no race condition as long as you assume there is 
no race condition" isn't all that reassuring...


That said, it's not very clear why patches #4 and #5 are here at all, 
given that patches #6 and #7 appear quite happy to handle block entries.



Co-developed-by: Keqian Zhu 
Signed-off-by: Kunkun Jiang 
---
  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c |  20 
  drivers/iommu/io-pgtable-arm.c  | 122 
  drivers/iommu/iommu.c   |  40 +++
  include/linux/io-pgtable.h  |   2 +
  include/linux/iommu.h   |  10 ++
  5 files changed, 194 insertions(+)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 9208881a571c..5469f4fca820 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2510,6 +2510,25 @@ static int arm_smmu_domain_set_attr(struct iommu_domain 
*domain,
return ret;
  }
  
+static size_t arm_smmu_split_block(struct iommu_domain *domain,
+				   unsigned long iova, size_t size)
+{
+	struct arm_smmu_device *smmu = to_smmu_domain(domain)->smmu;
+	struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
+
+	if (!(smmu->features & (ARM_SMMU_FEAT_BBML1 | ARM_SMMU_FEAT_BBML2))) {
+		dev_err(smmu->dev, "don't support BBML1/2 and split block\n");
+		return 0;
+	}
+
+	if (!ops || !ops->split_block) {
+		pr_err("don't support split block\n");
+		return 0;
+	}
+
+	return ops->split_block(ops, iova, size);
+}
+
+
  static int arm_smmu_of_xlate(struct device *dev, struct of_phandle_args *args)
  {
return iommu_fwspec_add_ids(dev, args->args, 1);
@@ -2609,6 +2628,7 @@ static struct iommu_ops arm_smmu_ops = {
	.device_group		= arm_smmu_device_group,
	.domain_get_attr	= arm_smmu_domain_get_attr,
	.domain_set_attr	= arm_smmu_domain_set_attr,
+	.split_block		= arm_smmu_split_block,
	.of_xlate		= arm_smmu_of_xlate,
	.get_resv_regions	= arm_smmu_get_resv_regions,
	.put_resv_regions	= generic_iommu_put_resv_regions,
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index e299a44808ae..f3b7f7115e38 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -79,6 +79,8 @@
  #define ARM_LPAE_PTE_SH_IS		(((arm_lpae_iopte)3) << 8)
  #define ARM_LPAE_PTE_NS		(((arm_lpae_iopte)1) << 5)
  #define ARM_LPAE_PTE_VALID		(((arm_lpae_iopte)1) << 0)
+/* Block descriptor bits */
+#define ARM_LPAE_PTE_NT		(((arm_lpae_iopte)1) << 16)
  
  #define ARM_LPAE_PTE_ATTR_LO_MASK	(((arm_lpae_iopte)0x3ff) << 2)
  
  /* Ignore the contiguous bit for block splitting */
@@ -679,6 +681,125 @@ static phys_addr_t arm_lpae_iova_to_phys(struct 
io_pgtable_ops *ops,
return iopte_to_paddr(pte, data) | iova;
  }
  
+static size_t __arm_lpae_split_block(struct arm_lpae_io_pgtable *data,
+				     unsigned long iova, size_t size, int lvl,
+				     arm_lpae_iopte *ptep);
+
+static size_t arm_lpae_do_split_blk(struct arm_lpae_io_pgtable *data,
+				    unsigned long iova, size_t size,
+				    arm_lpae_iopte blk_pte, int lvl,
+				    arm_lpae_iopte *ptep)
+{
+	struct io_pgtable_cfg *cfg = &data->iop.cfg;
+	arm_lpae_iopte pte, *tablep;
+	phys_addr_t blk_paddr;
+	size_t tablesz = ARM_LPAE_GRANULE(data);
+	size_t split_sz = ARM_LPAE_BLOCK_SIZE(lvl, data);
+	int i;
+
+	if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS))
+		return 0;
+
+	tablep = __arm_lpae_alloc_pages(tablesz, GFP_ATOMIC, cfg);
+	if (!tablep)
+		return 0;
+
+	blk_paddr = iopte_to_paddr(blk_pte, data);
+   

[RFC PATCH 04/11] iommu/arm-smmu-v3: Split block descriptor to a span of page

2021-01-28 Thread Keqian Zhu
From: jiangkunkun 

A block descriptor is not a proper granule for dirty log tracking. This
adds a new interface named split_block to the iommu layer, and arm smmuv3
implements it by splitting a block descriptor into an equivalent span of
page descriptors.

While splitting blocks, other interfaces are not expected to be working,
so no race condition exists. We flush all IOTLBs after the split procedure
completes to ease the pressure on the IOMMU, since we will generally split
a huge range of block mappings.

Co-developed-by: Keqian Zhu 
Signed-off-by: Kunkun Jiang 
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c |  20 
 drivers/iommu/io-pgtable-arm.c  | 122 
 drivers/iommu/iommu.c   |  40 +++
 include/linux/io-pgtable.h  |   2 +
 include/linux/iommu.h   |  10 ++
 5 files changed, 194 insertions(+)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 9208881a571c..5469f4fca820 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2510,6 +2510,25 @@ static int arm_smmu_domain_set_attr(struct iommu_domain 
*domain,
return ret;
 }
 
+static size_t arm_smmu_split_block(struct iommu_domain *domain,
+				   unsigned long iova, size_t size)
+{
+	struct arm_smmu_device *smmu = to_smmu_domain(domain)->smmu;
+	struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
+
+	if (!(smmu->features & (ARM_SMMU_FEAT_BBML1 | ARM_SMMU_FEAT_BBML2))) {
+		dev_err(smmu->dev, "don't support BBML1/2 and split block\n");
+		return 0;
+	}
+
+	if (!ops || !ops->split_block) {
+		pr_err("don't support split block\n");
+		return 0;
+	}
+
+	return ops->split_block(ops, iova, size);
+}
+
 static int arm_smmu_of_xlate(struct device *dev, struct of_phandle_args *args)
 {
return iommu_fwspec_add_ids(dev, args->args, 1);
@@ -2609,6 +2628,7 @@ static struct iommu_ops arm_smmu_ops = {
	.device_group		= arm_smmu_device_group,
	.domain_get_attr	= arm_smmu_domain_get_attr,
	.domain_set_attr	= arm_smmu_domain_set_attr,
+	.split_block		= arm_smmu_split_block,
	.of_xlate		= arm_smmu_of_xlate,
	.get_resv_regions	= arm_smmu_get_resv_regions,
	.put_resv_regions	= generic_iommu_put_resv_regions,
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index e299a44808ae..f3b7f7115e38 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -79,6 +79,8 @@
 #define ARM_LPAE_PTE_SH_IS		(((arm_lpae_iopte)3) << 8)
 #define ARM_LPAE_PTE_NS		(((arm_lpae_iopte)1) << 5)
 #define ARM_LPAE_PTE_VALID		(((arm_lpae_iopte)1) << 0)
+/* Block descriptor bits */
+#define ARM_LPAE_PTE_NT		(((arm_lpae_iopte)1) << 16)
 
 #define ARM_LPAE_PTE_ATTR_LO_MASK	(((arm_lpae_iopte)0x3ff) << 2)
 
 /* Ignore the contiguous bit for block splitting */
@@ -679,6 +681,125 @@ static phys_addr_t arm_lpae_iova_to_phys(struct 
io_pgtable_ops *ops,
return iopte_to_paddr(pte, data) | iova;
 }
 
+static size_t __arm_lpae_split_block(struct arm_lpae_io_pgtable *data,
+				     unsigned long iova, size_t size, int lvl,
+				     arm_lpae_iopte *ptep);
+
+static size_t arm_lpae_do_split_blk(struct arm_lpae_io_pgtable *data,
+				    unsigned long iova, size_t size,
+				    arm_lpae_iopte blk_pte, int lvl,
+				    arm_lpae_iopte *ptep)
+{
+	struct io_pgtable_cfg *cfg = &data->iop.cfg;
+	arm_lpae_iopte pte, *tablep;
+	phys_addr_t blk_paddr;
+	size_t tablesz = ARM_LPAE_GRANULE(data);
+	size_t split_sz = ARM_LPAE_BLOCK_SIZE(lvl, data);
+	int i;
+
+	if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS))
+		return 0;
+
+	tablep = __arm_lpae_alloc_pages(tablesz, GFP_ATOMIC, cfg);
+	if (!tablep)
+		return 0;
+
+	blk_paddr = iopte_to_paddr(blk_pte, data);
+	pte = iopte_prot(blk_pte);
+	for (i = 0; i < tablesz / sizeof(pte); i++, blk_paddr += split_sz)
+		__arm_lpae_init_pte(data, blk_paddr, pte, lvl, &tablep[i]);
+
+	if (cfg->bbml == 1) {
+		/* Race does not exist */
+		blk_pte |= ARM_LPAE_PTE_NT;
+		__arm_lpae_set_pte(ptep, blk_pte, cfg);
+		io_pgtable_tlb_flush_walk(&data->iop, iova, size, size);
+	}
+	/* Race does not exist */
+	pte = arm_lpae_install_table(tablep, ptep, blk_pte, cfg);
+
+	/* Have we split it down to pages? */
+	if (lvl == (ARM_LPAE_MAX_LEVELS - 1))
+		return size;
+
+   /* Go back to lvl - 1 */
+   ptep