Attached is a sequence of patches that refactor the handling of variable sized
integers used in the serialisation code.

I *think* that it makes sense to inline the various static functions into their
callers, as unlike the seemingly analogous fixed-sized integer code, the
various varint static functions are each only used by one function.
(Certainly, this is true after a small amount of refactoring)

Net effect of the patches is:

 src/6model/serialization.c | 186 +++++++++++++++++++++------------------------
 1 file changed, 85 insertions(+), 101 deletions(-)

so there's certainly less code, and at least to my eye it's simpler too.

Also attached is another test for NQP for serialisation.

Feedback and other opinions welcome.

Nicholas Clark
>From 27820d14304fd94f47158e1de0228779e2bfea01 Mon Sep 17 00:00:00 2001
From: Nicholas Clark <n...@ccl4.org>
Date: Tue, 11 Feb 2014 20:52:59 +0100
Subject: [PATCH 1/9] Remove redundant call to varintsize() from
 write_array_int().

There's no need for write_array_int() to call varintsize() or
expand_storage_if_needed() because the call it makes to
write_varint_func() will do these checks as well.
---
 src/6model/serialization.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/6model/serialization.c b/src/6model/serialization.c
index 380d1b9..bd10f28 100644
--- a/src/6model/serialization.c
+++ b/src/6model/serialization.c
@@ -422,12 +422,8 @@ static void write_array_int(MVMThreadContext *tc, MVMSerializationWriter *writer
 static void write_array_varint(MVMThreadContext *tc, MVMSerializationWriter *writer, MVMObject *arr) {
     MVMint32 elems = (MVMint32)MVM_repr_elems(tc, arr);
     MVMint32 i;
-    size_t storage_needed;
-
-    storage_needed = varintsize(elems);
 
     /* Write out element count. */
-    expand_storage_if_needed(tc, writer, storage_needed);
     write_varint_func(tc, writer, elems);
 
     /* Write elements. */
-- 
1.8.4.2

>From 3536fe50fdd1bbc9642b3f2b48d50a1d1f987962 Mon Sep 17 00:00:00 2001
From: Nicholas Clark <n...@ccl4.org>
Date: Tue, 11 Feb 2014 21:08:52 +0100
Subject: [PATCH 2/9] A slightly simpler implementation of write_varint9().

---
 src/6model/serialization.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/6model/serialization.c b/src/6model/serialization.c
index bd10f28..45b95a6 100644
--- a/src/6model/serialization.c
+++ b/src/6model/serialization.c
@@ -223,17 +223,17 @@ static size_t varintsize(int64_t value) {
 }
 
 static size_t write_varint9(MVMuint8 *buffer, size_t offset, int64_t value) {
-    // do we hvae to compare < or <= ?
-    size_t position;
     size_t needed_bytes = varintsize(value);
-    for (position = 0; position < needed_bytes && position != 8; position++) {
-        buffer[offset + position] = value & 0x7F;
-        if (position != needed_bytes - 1) buffer[offset + position] = buffer[offset + position] | 0x80;
+    size_t count = needed_bytes;
+
+    while (--count) {
+        buffer[offset++] = value & 0x7F | 0x80;
         value = value >> 7;
     }
     if (needed_bytes == 9) {
-        assert(position == 8);
-        buffer[offset + position] = value;
+        buffer[offset] = value;
+    } else {
+        buffer[offset] = value & 0x7F;
     }
     return needed_bytes;
 }
-- 
1.8.4.2

>From 9355d18d96fc79eadf791448313aadd21fbbf713 Mon Sep 17 00:00:00 2001
From: Nicholas Clark <n...@ccl4.org>
Date: Tue, 11 Feb 2014 21:17:16 +0100
Subject: [PATCH 3/9] Inline write_varint9() into its only caller,
 write_varint_func().

Update the comment describing write_varint_func().
---
 src/6model/serialization.c | 39 ++++++++++++++++++++-------------------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/src/6model/serialization.c b/src/6model/serialization.c
index 45b95a6..0bdc435 100644
--- a/src/6model/serialization.c
+++ b/src/6model/serialization.c
@@ -222,22 +222,6 @@ static size_t varintsize(int64_t value) {
     return 9;
 }
 
-static size_t write_varint9(MVMuint8 *buffer, size_t offset, int64_t value) {
-    size_t needed_bytes = varintsize(value);
-    size_t count = needed_bytes;
-
-    while (--count) {
-        buffer[offset++] = value & 0x7F | 0x80;
-        value = value >> 7;
-    }
-    if (needed_bytes == 9) {
-        buffer[offset] = value;
-    } else {
-        buffer[offset] = value & 0x7F;
-    }
-    return needed_bytes;
-}
-
 #define STRING_IS_NULL(s) ((s) == NULL)
 
 /* Adds an item to the MVMString heap if needed, and returns the index where
@@ -330,12 +314,29 @@ static void write_int_func(MVMThreadContext *tc, MVMSerializationWriter *writer,
     *(writer->cur_write_offset) += 8;
 }
 
-/* Writing function for varint9 */
+/* Writing function for variable sized integers. Writes out a 64 bit value
+   using between 1 and 9 bytes. */
 static void write_varint_func(MVMThreadContext *tc, MVMSerializationWriter *writer, MVMint64 value) {
     size_t storage_needed = varintsize(value);
-    size_t actually_written;
+    size_t count = storage_needed;
+    char *buffer;
+    size_t offset;
+
     expand_storage_if_needed(tc, writer, storage_needed);
-    actually_written = write_varint9(*(writer->cur_write_buffer), *(writer->cur_write_offset), value);
+
+    buffer = *(writer->cur_write_buffer);
+    offset = *(writer->cur_write_offset);
+
+    while (--count) {
+        buffer[offset++] = value & 0x7F | 0x80;
+        value = value >> 7;
+    }
+    if (storage_needed == 9) {
+        buffer[offset] = value;
+    } else {
+        buffer[offset] = value & 0x7F;
+    }
+
     *(writer->cur_write_offset) += storage_needed;
 }
 
-- 
1.8.4.2

>From 006f1508445082f4848613917df2c625fc5cb5cc Mon Sep 17 00:00:00 2001
From: Nicholas Clark <n...@ccl4.org>
Date: Tue, 11 Feb 2014 21:23:19 +0100
Subject: [PATCH 4/9] Move varintsize() nearer to its only caller.

Change its return value from size_t to int.
---
 src/6model/serialization.c | 51 +++++++++++++++++++++++-----------------------
 1 file changed, 25 insertions(+), 26 deletions(-)

diff --git a/src/6model/serialization.c b/src/6model/serialization.c
index 0bdc435..11adb65 100644
--- a/src/6model/serialization.c
+++ b/src/6model/serialization.c
@@ -198,30 +198,6 @@ static void write_double(char *buffer, size_t offset, double value) {
 #endif
 }
 
-/* Writes an int64 into up to 128 bits of storage.
- * Returns how far to advance the offset. */
-static size_t varintsize(int64_t value) {
-    if(value < 0)
-        value = -value - 1;
-    if(value < 64) /* 7 bits */
-        return 1;
-    if(value < 8192) /* 14 bits */
-        return 2;
-    if(value < 1048576) /* 21 bits */
-        return 3;
-    if(value < 134217728) /* 28 bits */
-        return 4;
-    if(value < 17179869184LL) /* 35 bits */
-        return 5;
-    if(value < 2199023255552LL) /* 42 bits */
-        return 6;
-    if(value < 281474976710656LL) /* 49 bits */
-        return 7;
-    if(value < 36028797018963968LL) /* 56 bits */
-        return 8;
-    return 9;
-}
-
 #define STRING_IS_NULL(s) ((s) == NULL)
 
 /* Adds an item to the MVMString heap if needed, and returns the index where
@@ -314,11 +290,34 @@ static void write_int_func(MVMThreadContext *tc, MVMSerializationWriter *writer,
     *(writer->cur_write_offset) += 8;
 }
 
+/* Size of the variable length encoding for a given value. */
+static int varintsize(int64_t value) {
+    if(value < 0)
+        value = -value - 1;
+    if(value < 64) /* 7 bits */
+        return 1;
+    if(value < 8192) /* 14 bits */
+        return 2;
+    if(value < 1048576) /* 21 bits */
+        return 3;
+    if(value < 134217728) /* 28 bits */
+        return 4;
+    if(value < 17179869184LL) /* 35 bits */
+        return 5;
+    if(value < 2199023255552LL) /* 42 bits */
+        return 6;
+    if(value < 281474976710656LL) /* 49 bits */
+        return 7;
+    if(value < 36028797018963968LL) /* 56 bits */
+        return 8;
+    return 9;
+}
+
 /* Writing function for variable sized integers. Writes out a 64 bit value
    using between 1 and 9 bytes. */
 static void write_varint_func(MVMThreadContext *tc, MVMSerializationWriter *writer, MVMint64 value) {
-    size_t storage_needed = varintsize(value);
-    size_t count = storage_needed;
+    const int storage_needed = varintsize(value);
+    int count = storage_needed;
     char *buffer;
     size_t offset;
 
-- 
1.8.4.2

>From a44923766053c039baedaa45d6859600d0bb8919 Mon Sep 17 00:00:00 2001
From: Nicholas Clark <n...@ccl4.org>
Date: Tue, 11 Feb 2014 21:55:03 +0100
Subject: [PATCH 5/9] In read_array_varint(), use read_varint_func() in place
 of two other calls.

This mirrors write_array_varint(), which calls write_varint_func().
---
 src/6model/serialization.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/6model/serialization.c b/src/6model/serialization.c
index 11adb65..86c1d57 100644
--- a/src/6model/serialization.c
+++ b/src/6model/serialization.c
@@ -1438,9 +1438,7 @@ static MVMObject * read_array_varint(MVMThreadContext *tc, MVMSerializationReade
     size_t header_size;
 
     /* Read the element count. */
-    assert_can_read_varint(tc, reader);
-    header_size = read_varint9(*(reader->cur_read_buffer), *(reader->cur_read_offset), &elems);
-    *(reader->cur_read_offset) += header_size;
+    elems = read_varint_func(tc, reader);
 
     /* Read in the elements. */
     for (i = 0; i < elems; i++)
-- 
1.8.4.2

>From c41c074735a72e66c4ab25e68b43006539819c2a Mon Sep 17 00:00:00 2001
From: Nicholas Clark <n...@ccl4.org>
Date: Tue, 11 Feb 2014 22:12:43 +0100
Subject: [PATCH 6/9] A simpler implementation of read_varint9().

---
 src/6model/serialization.c | 50 +++++++++++++++++++++-------------------------
 1 file changed, 23 insertions(+), 27 deletions(-)

diff --git a/src/6model/serialization.c b/src/6model/serialization.c
index 86c1d57..7381014 100644
--- a/src/6model/serialization.c
+++ b/src/6model/serialization.c
@@ -1201,37 +1201,33 @@ static MVMnum64 read_double(char *buffer, size_t offset) {
     return value;
 }
 
-/* Reads an int64 from up to 128bits of storage.
+/* Reads an int64 from up to 9 bytes of storage.
  * Returns how far to advance the offset. */
-static size_t read_varint9(MVMuint8 *buffer, size_t offset, int64_t *value) {
-    size_t inner_offset = 0;
-    size_t shift_amount = 0;
-    int64_t negation_mask = 0;
-    int read_on = !!(buffer[offset] & 0x80) + 1;
-    *value = 0;
-    while (read_on && inner_offset != 8) {
-        *value = *value | ((int64_t)(buffer[offset + inner_offset] & 0x7F) << shift_amount);
-        negation_mask = negation_mask | ((int64_t)0x7F << shift_amount);
-        if (read_on == 1 && buffer[offset + inner_offset] & 0x80) {
-            read_on = 2;
-        }
-        read_on--;
-        inner_offset++;
+static size_t read_varint9(MVMuint8 *buffer, size_t offset, int64_t *value_p) {
+    MVMuint8 *const start = buffer + offset;
+    MVMuint8 *const ninth = start + 8;
+    MVMuint8 *p = start;
+    int64_t value = 0;
+    int shift_amount = 0;
+
+    while (*p & 0x80 && p < ninth) {
+        value |= ((int64_t)(*p & 0x7F) << shift_amount);
         shift_amount += 7;
+        ++p;
     }
-    // our last byte will be a full byte, so that we reach the full 64 bits
-    if (read_on && inner_offset == 8) {
-        *value = *value | ((int64_t)buffer[offset + inner_offset] << shift_amount);
-        negation_mask = negation_mask | ((int64_t)0xFF << shift_amount);
-        ++inner_offset;
-    }
-    negation_mask = negation_mask >> 1;
-    // do we have a negative number so far?
-    if (*value & ~negation_mask) {
-        // we have to fill it up with ones all the way to the left.
-        *value = *value | ~negation_mask;
+    if (p == ninth) {
+        /* our last byte will be a full byte, so that we reach the full 64 bits
+           As we have the full 64 bits, no need to sign extend. */
+        value |= ((int64_t)(*p) << shift_amount);
+    } else {
+        value |= ((int64_t)(*p & 0x7F) << shift_amount);
+        /* Now sign extend the highest bit that we read. */
+        value = value << (57 - shift_amount);
+        value = value >> (57 - shift_amount);
     }
-    return inner_offset;
+    *value_p = value;
+
+    return p + 1 - start;
 }
 
 /* If deserialization should fail, cleans up before throwing an exception. */
-- 
1.8.4.2

>From c27a446018fe3b15a163688faba14ffa06c7e7f9 Mon Sep 17 00:00:00 2001
From: Nicholas Clark <n...@ccl4.org>
Date: Tue, 11 Feb 2014 22:23:27 +0100
Subject: [PATCH 7/9] Inline read_varint9() into its only caller,
 read_varint_func().

---
 src/6model/serialization.c | 59 ++++++++++++++++++++--------------------------
 1 file changed, 25 insertions(+), 34 deletions(-)

diff --git a/src/6model/serialization.c b/src/6model/serialization.c
index 7381014..9627e3c 100644
--- a/src/6model/serialization.c
+++ b/src/6model/serialization.c
@@ -1201,35 +1201,6 @@ static MVMnum64 read_double(char *buffer, size_t offset) {
     return value;
 }
 
-/* Reads an int64 from up to 9 bytes of storage.
- * Returns how far to advance the offset. */
-static size_t read_varint9(MVMuint8 *buffer, size_t offset, int64_t *value_p) {
-    MVMuint8 *const start = buffer + offset;
-    MVMuint8 *const ninth = start + 8;
-    MVMuint8 *p = start;
-    int64_t value = 0;
-    int shift_amount = 0;
-
-    while (*p & 0x80 && p < ninth) {
-        value |= ((int64_t)(*p & 0x7F) << shift_amount);
-        shift_amount += 7;
-        ++p;
-    }
-    if (p == ninth) {
-        /* our last byte will be a full byte, so that we reach the full 64 bits
-           As we have the full 64 bits, no need to sign extend. */
-        value |= ((int64_t)(*p) << shift_amount);
-    } else {
-        value |= ((int64_t)(*p & 0x7F) << shift_amount);
-        /* Now sign extend the highest bit that we read. */
-        value = value << (57 - shift_amount);
-        value = value >> (57 - shift_amount);
-    }
-    *value_p = value;
-
-    return p + 1 - start;
-}
-
 /* If deserialization should fail, cleans up before throwing an exception. */
 MVM_NO_RETURN
 static void fail_deserialize(MVMThreadContext *tc, MVMSerializationReader *reader,
@@ -1305,13 +1276,33 @@ static int assert_can_read_varint(MVMThreadContext *tc, MVMSerializationReader *
     return 1;
 }
 
-/* Reading function for variable-sized integers */
+/* Reading function for variable-sized integers, using between 1 and 9 bytes of
+   storage for an int64. */
 static MVMint64 read_varint_func(MVMThreadContext *tc, MVMSerializationReader *reader) {
-    MVMint64 result;
-    size_t length;
+    MVMint64 result = 0;
+    MVMuint8 *const start = (MVMuint8 *) *(reader->cur_read_buffer) + *(reader->cur_read_offset);
+    MVMuint8 *const ninth = start + 8;
+    MVMuint8 *p = start;
+    int shift_amount = 0;
     assert_can_read_varint(tc, reader);
-    length = read_varint9(*(reader->cur_read_buffer), *(reader->cur_read_offset), &result);
-    *(reader->cur_read_offset) += length;
+
+    while (*p & 0x80 && p < ninth) {
+        result |= ((MVMint64)(*p & 0x7F) << shift_amount);
+        shift_amount += 7;
+        ++p;
+    }
+    if (p == ninth) {
+        /* our last byte will be a full byte, so that we reach the full 64 bits
+           As we have the full 64 bits, no need to sign extend. */
+        result |= ((MVMint64)(*p) << shift_amount);
+    } else {
+        result |= ((MVMint64)(*p & 0x7F) << shift_amount);
+        /* Now sign extend the highest bit that we read. */
+        result = result << (57 - shift_amount);
+        result = result >> (57 - shift_amount);
+    }
+
+    *(reader->cur_read_offset) += p + 1 - start;
     return result;
 }
 
-- 
1.8.4.2

>From a501cb1146b019daafeffe68bf82eae364934085 Mon Sep 17 00:00:00 2001
From: Nicholas Clark <n...@ccl4.org>
Date: Wed, 12 Feb 2014 10:03:52 +0100
Subject: [PATCH 8/9] Inline assert_can_read_varint() into read_varint_func()

To work at all, assert_can_read_varint() had to duplicate the variable length
decode logic. Really we only want that logic in one place, so it's simpler to
move the buffer bounds check inside the reader function itself.
---
 src/6model/serialization.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/6model/serialization.c b/src/6model/serialization.c
index 9627e3c..588ceaf 100644
--- a/src/6model/serialization.c
+++ b/src/6model/serialization.c
@@ -1265,31 +1265,31 @@ static MVMint64 read_int_func(MVMThreadContext *tc, MVMSerializationReader *read
     return result;
 }
 
-static int assert_can_read_varint(MVMThreadContext *tc, MVMSerializationReader *reader) {
-    size_t length_so_far = 1;
-    assert_can_read(tc, reader, 1);
-    while (length_so_far <= 8 && ((*reader->cur_read_buffer)[*reader->cur_read_offset + length_so_far - 1] & 0x80))
-        assert_can_read(tc, reader, ++length_so_far);
-    if (length_so_far > 9) {
-        return 0;
-    }
-    return 1;
-}
-
 /* Reading function for variable-sized integers, using between 1 and 9 bytes of
    storage for an int64. */
 static MVMint64 read_varint_func(MVMThreadContext *tc, MVMSerializationReader *reader) {
     MVMint64 result = 0;
     MVMuint8 *const start = (MVMuint8 *) *(reader->cur_read_buffer) + *(reader->cur_read_offset);
+    MVMuint8 *const read_end = (MVMuint8 *) *(reader->cur_read_end);
     MVMuint8 *const ninth = start + 8;
     MVMuint8 *p = start;
     int shift_amount = 0;
-    assert_can_read_varint(tc, reader);
+
+    /* We can't know how many bytes we need to read without actually reading
+       them, so it's easiest to inline the over-the-end test into the loop.
+       read_end is exclusive - it's the address of the first thing we can't
+       read. Therefore if we're about to read at it, we're too far. */
+    if (p == read_end)
+        fail_deserialize(tc, reader,
+                         "Read past end of serialization data buffer");
 
     while (*p & 0x80 && p < ninth) {
         result |= ((MVMint64)(*p & 0x7F) << shift_amount);
         shift_amount += 7;
         ++p;
+        if (p == read_end)
+            fail_deserialize(tc, reader,
+                             "Read past end of serialization data buffer");
     }
     if (p == ninth) {
         /* our last byte will be a full byte, so that we reach the full 64 bits
-- 
1.8.4.2

>From 0d24b64f9311518c776cd405e720d04b4f0c0ad0 Mon Sep 17 00:00:00 2001
From: Nicholas Clark <n...@ccl4.org>
Date: Wed, 12 Feb 2014 10:19:18 +0100
Subject: [PATCH 9/9] Slight simplification possible in read_varint_func()

If the loop terminates before the ninth byte, the top bit of the last byte read
must be clear. So there's no need to mask it before merging it.
---
 src/6model/serialization.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/6model/serialization.c b/src/6model/serialization.c
index 588ceaf..90f2c20 100644
--- a/src/6model/serialization.c
+++ b/src/6model/serialization.c
@@ -1296,7 +1296,10 @@ static MVMint64 read_varint_func(MVMThreadContext *tc, MVMSerializationReader *r
            As we have the full 64 bits, no need to sign extend. */
         result |= ((MVMint64)(*p) << shift_amount);
     } else {
-        result |= ((MVMint64)(*p & 0x7F) << shift_amount);
+        /* As the loop above terminated before the ninth byte, the top bit must
+           be clear. */
+        assert(!(*p & 0x80));
+        result |= ((MVMint64)(*p) << shift_amount);
         /* Now sign extend the highest bit that we read. */
         result = result << (57 - shift_amount);
         result = result >> (57 - shift_amount);
-- 
1.8.4.2

>From cfbd81282dec220db2e922728f696245ed5fb88f Mon Sep 17 00:00:00 2001
From: Nicholas Clark <n...@ccl4.org>
Date: Tue, 11 Feb 2014 20:47:23 +0100
Subject: [PATCH 4/4] Test serializing integers with 63 bits set and 1 bit
 clear.

These bit patterns may catch out some bugs.
---
 t/serialization/01-basic.t | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/t/serialization/01-basic.t b/t/serialization/01-basic.t
index d2c25b7..7a76394 100644
--- a/t/serialization/01-basic.t
+++ b/t/serialization/01-basic.t
@@ -1,6 +1,6 @@
 #! nqp
 
-plan(1355);
+plan(1421);
 
 sub add_to_sc($sc, $idx, $obj) {
     nqp::scsetobj($sc, $idx, $obj);
@@ -470,3 +470,14 @@ sub round_trip_int_array($seq, $desc, @a) {
 
     round_trip_int_array(70, 'special case integers', @a);
 }
+
+{
+    my @a;
+    my $i := 0;
+    while ($i < 64) {
+        nqp::push(@a, nqp::bitxor_i(-1, nqp::bitshiftl_i(1, $i)));
+        ++$i;
+    }
+
+    round_trip_int_array(71, 'integers with one zero bit', @a);
+}
-- 
1.8.4.2

Reply via email to