Re: [Qemu-devel] [PATCH v3 6/6] vmdk: Update metadata for multiple clusters

Ashijeet Acharya Fri, 21 Apr 2017 21:14:30 -0700

On Fri, Apr 21, 2017 at 1:45 PM, Fam Zheng <f...@redhat.com> wrote:
> On Sat, 04/01 20:14, Ashijeet Acharya wrote:
>> Include a next pointer in VmdkMetaData struct to point to the previous
>> allocated L2 table. Modify vmdk_L2update to start updating metadata for
>> allocation of multiple clusters at once.
>>
>> Signed-off-by: Ashijeet Acharya <ashijeetacha...@gmail.com>
>
> This is the metadata part of the coalesed allocation. I think patch 3 is
> functionally incomplete without these changes, and is perhaps broken because
> metadata is not handled correctly.
>
> Such an "intermediate functional regression" is not good in a series, which we
> need to avoid.


I have moved this patch right after patch 3 because merging both will
result in an unnecessary huge patch. Will that work?
>
>> ---
>>  block/vmdk.c | 136 
>> ++++++++++++++++++++++++++++++++++++++++++++++++-----------
>>  1 file changed, 111 insertions(+), 25 deletions(-)
>>
>> diff --git a/block/vmdk.c b/block/vmdk.c
>> index 9456ddd..c7675db 100644
>> --- a/block/vmdk.c
>> +++ b/block/vmdk.c
>> @@ -137,6 +137,8 @@ typedef struct VmdkMetaData {
>>      int valid;
>>      uint32_t *l2_cache_entry;
>>      uint32_t nb_clusters;
>> +    uint32_t offset;
>> +    struct VmdkMetaData *next;
>>  } VmdkMetaData;
>>
>>  typedef struct VmdkGrainMarker {
>> @@ -263,6 +265,12 @@ static inline uint64_t size_to_clusters(VmdkExtent 
>> *extent, uint64_t size)
>>      return (DIV_ROUND_UP(size + round_off_size, BDRV_SECTOR_SIZE * 128) - 
>> 1);
>>  }
>>
>> +static inline int64_t vmdk_align_offset(int64_t offset, int n)
>> +{
>> +    offset = (offset + n - 1) & ~(n - 1);
>> +    return offset;
>> +}
>> +
>>  static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent)
>>  {
>>      char *desc;
>> @@ -1037,29 +1045,88 @@ static void vmdk_refresh_limits(BlockDriverState 
>> *bs, Error **errp)
>>      }
>>  }
>>
>> -static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data,
>> -                         uint32_t offset)
>> +static int vmdk_alloc_cluster_link_l2(VmdkExtent *extent,
>> +                                      VmdkMetaData *m_data, bool zeroed)
>>  {
>> -    offset = cpu_to_le32(offset);
>> +    int i;
>> +    uint32_t offset, temp_offset;
>> +    int *l2_table_array;
>> +    int l2_array_size;
>> +
>> +    if (zeroed) {
>> +        temp_offset = VMDK_GTE_ZEROED;
>> +    } else {
>> +        temp_offset = m_data->offset;
>> +    }
>> +
>> +    temp_offset = cpu_to_le32(temp_offset);
>> +
>> +    l2_array_size = sizeof(uint32_t) * m_data->nb_clusters;
>> +    l2_table_array = qemu_try_blockalign(extent->file->bs,
>> +                                    vmdk_align_offset(l2_array_size, 512));
>
> Indentation is off.
>
> Use QEMU_ALIGN_UP, instead of vmdk_align_offset.
>
> 512 is a magic number, use BDRV_SECTOR_SIZE.

Done

>
>> +    if (l2_table_array == NULL) {
>> +        return VMDK_ERROR;
>> +    }
>> +    memset(l2_table_array, 0, vmdk_align_offset(l2_array_size, 512));
>> +
>>      /* update L2 table */
>> +    offset = temp_offset;
>> +    for (i = 0; i < m_data->nb_clusters; i++) {
>> +        l2_table_array[i] = offset;
>> +        if (!zeroed) {
>> +            offset += 128;
>
> Something is going wrong here with endianness on BE host, I believe.

I have changed temp_offset to LE above, wouldn't that be enough. I am not sure.

>
>> +        }
>> +    }
>> +
>>      if (bdrv_pwrite_sync(extent->file,
>> -                ((int64_t)m_data->l2_offset * 512)
>> -                    + (m_data->l2_index * sizeof(offset)),
>> -                &offset, sizeof(offset)) < 0) {
>> +            ((int64_t)m_data->l2_offset * 512)
>> +                + ((m_data->l2_index) * sizeof(offset)),
>> +                         l2_table_array, l2_array_size) < 0) {
>
> You can fix the indentation while changing these lines. If not, don't change 
> it,
> or at least don't make it uglier.

I have aligned it, if it still looks ugly in v4, I will revert.

>
>>          return VMDK_ERROR;
>>      }
>> +
>>      /* update backup L2 table */
>>      if (extent->l1_backup_table_offset != 0) {
>>          m_data->l2_offset = extent->l1_backup_table[m_data->l1_index];
>>          if (bdrv_pwrite_sync(extent->file,
>>                      ((int64_t)m_data->l2_offset * 512)
>> -                        + (m_data->l2_index * sizeof(offset)),
>> -                    &offset, sizeof(offset)) < 0) {
>> +                        + ((m_data->l2_index) * sizeof(offset)),
>> +                                l2_table_array, l2_array_size) < 0) {
>
> Same here.
>
>>              return VMDK_ERROR;
>>          }
>>      }
>> +
>> +    offset = temp_offset;
>>      if (m_data->l2_cache_entry) {
>> -        *m_data->l2_cache_entry = offset;
>> +        for (i = 0; i < m_data->nb_clusters; i++) {
>> +            *m_data->l2_cache_entry = offset;
>> +            m_data->l2_cache_entry++;
>> +
>> +            if (!zeroed) {
>> +                offset += 128;
>> +            }
>> +        }
>> +    }
>> +
>> +    qemu_vfree(l2_table_array);
>> +    return VMDK_OK;
>> +}
>> +
>> +static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data,
>> +                         bool zeroed)
>> +{
>> +    int ret;
>> +
>> +    while (m_data->next != NULL) {
>> +        VmdkMetaData *next;
>> +
>> +        ret = vmdk_alloc_cluster_link_l2(extent, m_data, zeroed);
>> +        if (ret < 0) {
>> +            return ret;
>> +        }
>> +
>> +        next = m_data->next;
>> +        m_data = next;
>
> Why not simply "m_data = m_data->next" and drop "next" variable?
>>      }
>>
>>      return VMDK_OK;
>> @@ -1271,7 +1338,7 @@ exit:
>>   */
>>  static int handle_alloc(BlockDriverState *bs, VmdkExtent *extent,
>>                          uint64_t offset, uint64_t *cluster_offset,
>> -                        int64_t *bytes, VmdkMetaData *m_data,
>> +                        int64_t *bytes, VmdkMetaData **m_data,
>>                          bool allocate, uint32_t *total_alloc_clusters)
>>  {
>>      int l1_index, l2_offset, l2_index;
>> @@ -1280,6 +1347,7 @@ static int handle_alloc(BlockDriverState *bs, 
>> VmdkExtent *extent,
>>      uint32_t nb_clusters;
>>      bool zeroed = false;
>>      uint64_t skip_start_bytes, skip_end_bytes;
>> +    VmdkMetaData *old_m_data;
>>      int ret;
>>
>>      ret = get_cluster_table(extent, offset, &l1_index, &l2_offset,
>> @@ -1330,13 +1398,21 @@ static int handle_alloc(BlockDriverState *bs, 
>> VmdkExtent *extent,
>>          if (ret < 0) {
>>              return ret;
>>          }
>> -        if (m_data) {
>> -            m_data->valid = 1;
>> -            m_data->l1_index = l1_index;
>> -            m_data->l2_index = l2_index;
>> -            m_data->l2_offset = l2_offset;
>> -            m_data->l2_cache_entry = &l2_table[l2_index];
>> -            m_data->nb_clusters = nb_clusters;
>> +
>> +        if (*m_data) {
>> +            old_m_data = *m_data;
>> +            *m_data = g_malloc0(sizeof(**m_data));
>> +
>> +            **m_data = (VmdkMetaData) {
>> +                .valid            =    1,
>> +                .l1_index         =    l1_index,
>> +                .l2_index         =    l2_index,
>> +                .l2_offset        =    l2_offset,
>> +                .l2_cache_entry   =    &l2_table[l2_index],
>> +                .nb_clusters      =    nb_clusters,
>> +                .offset           =    cluster_sector,
>> +                .next             =    old_m_data,
>> +            };
>
> I think if the new m_data can be merged into the old, there is no need to
> allocate a new one.

Do you mean that if the clusters lie in the same l2 table, then merge
them? I think this case only appears when I leave out the first and
last cluster for COW. If I misunderstood, sorry!
I think I will post v4 without attending this issue and we can discuss
this when you are available after the weekend.

>
>>          }
>>      }
>>      *cluster_offset = cluster_sector << BDRV_SECTOR_BITS;
>> @@ -1365,7 +1441,7 @@ static int handle_alloc(BlockDriverState *bs, 
>> VmdkExtent *extent,
>>   */
>>  static int vmdk_alloc_cluster_offset(BlockDriverState *bs,
>>                                       VmdkExtent *extent,
>> -                                     VmdkMetaData *m_data, uint64_t offset,
>> +                                     VmdkMetaData **m_data, uint64_t offset,
>>                                       bool allocate, uint64_t 
>> *cluster_offset,
>>                                       int64_t bytes,
>>                                       uint32_t *total_alloc_clusters)
>> @@ -1385,8 +1461,8 @@ static int vmdk_alloc_cluster_offset(BlockDriverState 
>> *bs,
>>      new_cluster_offset = 0;
>>      *cluster_offset = 0;
>>      n_bytes = 0;
>> -    if (m_data) {
>> -        m_data->valid = 0;
>> +    if (*m_data) {
>> +        (*m_data)->valid = 0;
>>      }
>>
>>      /* due to L2 table margins all bytes may not get allocated at once */
>> @@ -1768,9 +1844,11 @@ static int vmdk_pwritev(BlockDriverState *bs, 
>> uint64_t offset,
>>      uint64_t cluster_offset;
>>      uint64_t bytes_done = 0;
>>      uint64_t extent_size;
>> -    VmdkMetaData m_data;
>> +    VmdkMetaData *m_data;
>>      uint32_t total_alloc_clusters = 0;
>>
>> +    m_data = g_malloc0(sizeof(*m_data));
>> +
[scroll till here] [1] So this allocation will need to move....[2]

>>      if (DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE) > bs->total_sectors) {
>>          error_report("Wrong offset: offset=0x%" PRIx64
>>                       " total_sectors=0x%" PRIx64,
>> @@ -1779,6 +1857,7 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t 
>> offset,
>>      }
>>
>>      while (bytes > 0) {

....[2] here. Thus we will need to allocate it again every time we
enter here otherwise the very next line m_data->next=NULL will
segfault.

So maybe its good to free it separately?
I will retain it this way for v4 and change it otherwise if you still
say so after my reasoning in v5.

>> +        m_data->next = NULL;
>>          extent = find_extent(s, offset >> BDRV_SECTOR_BITS, extent);
>>          if (!extent) {
>>              return -EIO;
>> @@ -1825,7 +1904,7 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t 
>> offset,
>>                                          total_alloc_clusters;
>>                  if (!zero_dry_run) {
>>                      /* update L2 tables */
>> -                    if (vmdk_L2update(extent, &m_data, VMDK_GTE_ZEROED)
>> +                    if (vmdk_L2update(extent, m_data, zeroed)
>>                              != VMDK_OK) {
>>                          return -EIO;
>>                      }
>> @@ -1839,10 +1918,9 @@ static int vmdk_pwritev(BlockDriverState *bs, 
>> uint64_t offset,
>>              if (ret) {
>>                  return ret;
>>              }
>> -            if (m_data.valid) {
>> +            if (m_data->valid) {
>>                  /* update L2 tables */
>> -                if (vmdk_L2update(extent, &m_data,
>> -                                  cluster_offset >> BDRV_SECTOR_BITS)
>> +                if (vmdk_L2update(extent, m_data, zeroed)
>>                          != VMDK_OK) {
>>                      return -EIO;
>>                  }
>> @@ -1852,6 +1930,13 @@ static int vmdk_pwritev(BlockDriverState *bs, 
>> uint64_t offset,
>>          offset += n_bytes;
>>          bytes_done += n_bytes;
>>
>> +        while (m_data->next != NULL) {
>> +            VmdkMetaData *next;
>> +            next = m_data->next;
>> +            g_free(m_data);
>> +            m_data = next;
>> +        }
>> +
>>          /* update CID on the first write every time the virtual disk is
>>           * opened */
>>          if (!s->cid_updated) {
>> @@ -1862,6 +1947,7 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t 
>> offset,
>>              s->cid_updated = true;
>>          }
>>      }
>> +    g_free(m_data);
>
> This is weird, you free all but the last m_data with a while loop, a few lines
> above, and this one with a separate g_free().
>
> Please use one loop:
>
>     for (p = m_data; p; p = next) {
>         next = p->next;
>         g_free(p);
>     }

I have a good (maybe good enough) reason for it, if I free it in the
while loop above, then I will need to allocate it again when we enter
the superior while(bytes>0) loop, otherwise we will segfault for
everything from that point onwards....[scroll up __^] [1]

Ashijeet

Re: [Qemu-devel] [PATCH v3 6/6] vmdk: Update metadata for multiple clusters

Reply via email to