This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit 8df970f7a6520bb0dc0f9cc89ad7f62ab349e84d
Author: Alexey Serbin <ale...@apache.org>
AuthorDate: Fri Jul 29 19:57:01 2022 -0700

    KUDU-2671 fix updating partition end keys for unbounded ranges
    
    This patch fixes an issue with updating the hash bucket components of
    partitions' end keys in the case of unbounded ranges.  I also added new
    test scenarios that made it possible to spot the issue.  The newly added
    scenarios would fail without the fix.  In addition, I updated a few other
    test scenarios to match the current logic of updating the hash partition
    component of a partition's end key.
    
    The previous implementation of updating the hash components tried to
    avoid holes in the partition keyspace by carrying over iotas to the next
    hash dimension index, but with the introduction of range-specific hash
    schemas it's no longer possible to do so while keeping the resulting
    key ranges disjoint.
    
    For example, consider the following partitioning:
      [-inf, 0) x 3 buckets x 3 buckets; [0, +inf) x 2 buckets x 2 buckets
    
    The original set of ranges looks like the following:
      [ (0, 0,  ""), (0, 0, "0") )
      [ (0, 1,  ""), (0, 1, "0") )
      [ (0, 2,  ""), (0, 2, "0") )
      [ (1, 0,  ""), (1, 0, "0") )
      [ (1, 1,  ""), (1, 1, "0") )
      [ (1, 2,  ""), (1, 2, "0") )
      [ (2, 0,  ""), (2, 0, "0") )
      [ (2, 1,  ""), (2, 1, "0") )
      [ (2, 2,  ""), (2, 2, "0") )
    
      [ (0, 0, "0"), (0, 0,  "") )
      [ (0, 1, "0"), (0, 1,  "") )
      [ (1, 0, "0"), (1, 0,  "") )
      [ (1, 1, "0"), (1, 1,  "") )
    
    The previous implementation would transform the key range
    [ (0, 1, "0"), (0, 1,  "") ) into [ (0, 1, "0"), (1, 0,  "") ), but
    that would put the key range [ (0, 2,  ""), (0, 2, "0") ) inside the
    transformed one. That would mess up the interval logic of the partition
    pruning and the client's metacache.
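    
    To make the containment concrete, below is a minimal standalone C++
    sketch (illustrative only, not Kudu code) that encodes keys the same
    way as the encoded partition keys in the tests: four big-endian bytes
    per hash bucket followed by the raw range key, compared as one byte
    string, which is equivalent for equal-width hash prefixes:
    
      #include <cassert>
      #include <cstdint>
      #include <string>
    
      // Encode (bucket0, bucket1, range_key): 4 big-endian bytes per
      // hash bucket, followed by the raw range key bytes.
      std::string Key(int32_t b0, int32_t b1, const std::string& range_key) {
        std::string key;
        for (int32_t b : { b0, b1 }) {
          for (int shift = 24; shift >= 0; shift -= 8) {
            key += static_cast<char>((b >> shift) & 0xff);
          }
        }
        return key + range_key;
      }
    
      int main() {
        const std::string a_begin   = Key(0, 1, "0");
        const std::string a_end_old = Key(1, 0, "");  // carried-over end key
        const std::string a_end_new = Key(0, 2, "");  // end key after the fix
        const std::string b_begin   = Key(0, 2, "");
        const std::string b_end     = Key(0, 2, "0");
        // Old behavior: [b_begin, b_end) lies inside [a_begin, a_end_old).
        assert(a_begin < b_begin && b_end < a_end_old);
        // New behavior: the first range ends exactly where the second
        // begins, so the two ranges are disjoint.
        assert(a_end_new == b_begin);
        return 0;
      }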
    
    As it turns out, a continuous keyspace was just a nice thing to have:
    both the partition pruning and the client's metacache work fine with
    sparse keyspaces as well.
    
    Change-Id: I3775f914a3b7cdc294a26911663c2d848f4e491b
    Reviewed-on: http://gerrit.cloudera.org:8080/18808
    Tested-by: Kudu Jenkins
    Reviewed-by: Mahesh Reddy <mre...@cloudera.com>
    Reviewed-by: Abhishek Chennaka <achenn...@cloudera.com>
    Reviewed-by: Attila Bukor <abu...@apache.org>
---
 .../org/apache/kudu/client/TestAlterTable.java     |  10 +-
 src/kudu/client/flex_partitioning_client-test.cc   | 184 ++++++++++++++++++++-
 src/kudu/common/partition-test.cc                  |  32 ++--
 src/kudu/common/partition.cc                       | 118 ++++++++-----
 4 files changed, 279 insertions(+), 65 deletions(-)

diff --git a/java/kudu-client/src/test/java/org/apache/kudu/client/TestAlterTable.java b/java/kudu-client/src/test/java/org/apache/kudu/client/TestAlterTable.java
index f0270b446..c10b6402d 100644
--- a/java/kudu-client/src/test/java/org/apache/kudu/client/TestAlterTable.java
+++ b/java/kudu-client/src/test/java/org/apache/kudu/client/TestAlterTable.java
@@ -253,12 +253,16 @@ public class TestAlterTable {
 
     KuduScanner scanner = client.newScannerBuilder(table)
             .setProjectedColumnNames(Lists.newArrayList("c0Key", 
"c1")).build();
+    int rowCount = 0;
     while (scanner.hasMoreRows()) {
       RowResultIterator it = scanner.nextRows();
-      assertTrue(it.hasNext());
-      RowResult rr = it.next();
-      assertEquals(rr.getInt(0), rr.getInt(1));
+      while (it.hasNext()) {
+        RowResult rr = it.next();
+        assertEquals(rr.getInt(0), rr.getInt(1));
+        ++rowCount;
+      }
     }
+    assertEquals(101, rowCount);
   }
 
   @Test
diff --git a/src/kudu/client/flex_partitioning_client-test.cc b/src/kudu/client/flex_partitioning_client-test.cc
index 6b42eff1b..66d504a83 100644
--- a/src/kudu/client/flex_partitioning_client-test.cc
+++ b/src/kudu/client/flex_partitioning_client-test.cc
@@ -27,14 +27,15 @@
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 
-#include "kudu/client/client.h"
 #include "kudu/client/client-test-util.h"
+#include "kudu/client/client.h"
 #include "kudu/client/scan_batch.h"
 #include "kudu/client/scan_predicate.h"
 #include "kudu/client/schema.h"
 #include "kudu/client/value.h"
 #include "kudu/client/write_op.h"
 #include "kudu/common/partial_row.h"
+#include "kudu/common/partition.h"
 #include "kudu/gutil/port.h"
 #include "kudu/gutil/ref_counted.h"
 #include "kudu/gutil/stl_util.h"
@@ -43,14 +44,14 @@
 #include "kudu/master/master.h"
 #include "kudu/master/mini_master.h"
 #include "kudu/mini-cluster/internal_mini_cluster.h"
-#include "kudu/tablet/tablet_replica.h"
 #include "kudu/tablet/tablet.h"
+#include "kudu/tablet/tablet_replica.h"
 #include "kudu/tserver/mini_tablet_server.h"
 #include "kudu/tserver/tablet_server.h"
 #include "kudu/tserver/ts_tablet_manager.h"
-#include "kudu/util/metrics.h"
 #include "kudu/util/curl_util.h"
 #include "kudu/util/faststring.h"
+#include "kudu/util/metrics.h"
 #include "kudu/util/net/sockaddr.h"
 #include "kudu/util/slice.h"
 #include "kudu/util/status.h"
@@ -1562,6 +1563,183 @@ TEST_F(FlexPartitioningCreateTableTest, Negatives) {
   }
 }
 
+// This test scenario verifies that ListPartitions() returns the correct
+// number of partitions for tables with unbounded ranges and custom hash
+// schemas. Essentially, this scenario makes sure the logic used by the code
+// to iterate over range partitions by employing PartitionPruner's
+// NextPartitionKey() and RemovePartitionKeyRange() methods works as expected.
+TEST_F(FlexPartitioningCreateTableTest, UnboundedRangesOneDimensionHash) {
+  constexpr const char* const kTableName = "UnboundedRangesOneDimensionHash";
+
+  unique_ptr<KuduTableCreator> table_creator(client_->NewTableCreator());
+  table_creator->table_name(kTableName)
+      .schema(&schema_)
+      .num_replicas(1)
+      .add_hash_partitions({ kKeyColumn }, 2)
+      .set_range_partition_columns({ kKeyColumn });
+
+  // Add a range partition with the table-wide hash schema.
+  {
+    unique_ptr<KuduPartialRow> lower(schema_.NewRow());
+    unique_ptr<KuduPartialRow> upper(schema_.NewRow());
+    ASSERT_OK(upper->SetInt32(kKeyColumn, -10));
+    table_creator->add_range_partition(lower.release(), upper.release());
+  }
+
+  // Add two range partitions with custom hash schemas.
+  {
+    auto p = CreateRangePartition(-10, 10);
+    ASSERT_OK(p->add_hash_partitions({ kKeyColumn }, 5, 1));
+    table_creator->add_custom_range_partition(p.release());
+  }
+  {
+    auto p = CreateRangePartitionNoUpperBound(10);
+    ASSERT_OK(p->add_hash_partitions({ kKeyColumn }, 3, 2));
+    table_creator->add_custom_range_partition(p.release());
+  }
+
+  ASSERT_OK(table_creator->Create());
+  NO_FATALS(CheckTabletCount(kTableName, 10));  // 2 + 5 + 3 = 10
+  shared_ptr<KuduTable> table;
+  ASSERT_OK(client_->OpenTable(kTableName, &table));
+
+  vector<Partition> partitions;
+  ASSERT_OK(table->ListPartitions(&partitions));
+  ASSERT_EQ(10, partitions.size());
+
+  // Make sure it's possible to insert rows into the table for all the existing
+  // partitions.
+  ASSERT_OK(InsertTestRows(kTableName, -50, 50));
+  NO_FATALS(CheckTableRowsNum(kTableName, 100));
+}
+
+// Similar to FlexPartitioningCreateTableTest.UnboundedRangesOneDimensionHash
+// above, but with two hash dimensions.
+TEST_F(FlexPartitioningCreateTableTest, UnboundedRangesTwoDimensionHash) {
+  constexpr const char* const kTableName = "UnboundedRangesTwoDimensionHash";
+  constexpr const char* const kC0 = "c0";
+  constexpr const char* const kC1 = "c1";
+  constexpr const char* const kC2 = "c2";
+
+  KuduSchemaBuilder b;
+  b.AddColumn(kC0)->Type(KuduColumnSchema::INT32)->NotNull();
+  b.AddColumn(kC1)->Type(KuduColumnSchema::INT32)->NotNull();
+  b.AddColumn(kC2)->Type(KuduColumnSchema::STRING)->Nullable();
+  b.SetPrimaryKey({ kC0, kC1 });
+  KuduSchema schema;
+  ASSERT_OK(b.Build(&schema));
+
+  auto rows_inserter = [&](int32_t key_beg, int32_t key_end) {
+    vector<KuduError*> errors;
+    CHECK_LE(key_beg, key_end);
+    shared_ptr<KuduTable> table;
+    RETURN_NOT_OK(client_->OpenTable(kTableName, &table));
+    shared_ptr<KuduSession> session = client_->NewSession();
+    RETURN_NOT_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH));
+    session->SetTimeoutMillis(60000);
+    for (int32_t key_val = key_beg; key_val < key_end; ++key_val) {
+      unique_ptr<KuduInsert> insert(table->NewInsert());
+      RETURN_NOT_OK(insert->mutable_row()->SetInt32(kC0, key_val));
+      RETURN_NOT_OK(insert->mutable_row()->SetInt32(kC1, key_val));
+      RETURN_NOT_OK(insert->mutable_row()->SetStringCopy(kC2, std::to_string(rand())));
+      RETURN_NOT_OK(session->Apply(insert.release()));
+    }
+    return session->Flush();
+  };
+
+  // Two range partitions: [-inf, 0)  [0, +inf)
+  {
+    unique_ptr<KuduTableCreator> table_creator(client_->NewTableCreator());
+    table_creator->table_name(kTableName)
+        .schema(&schema)
+        .num_replicas(1)
+        .add_hash_partitions({ kC0 }, 3)
+        .add_hash_partitions({ kC1 }, 3)
+        .set_range_partition_columns({ kC0 });
+
+    // Add a range partition with the table-wide hash schema.
+    {
+      unique_ptr<KuduPartialRow> lower(schema.NewRow());
+      unique_ptr<KuduPartialRow> upper(schema.NewRow());
+      ASSERT_OK(upper->SetInt32(kC0, 0));
+      table_creator->add_range_partition(lower.release(), upper.release());
+    }
+
+    // Add a range partition with custom hash schema.
+    {
+      unique_ptr<KuduPartialRow> lower(schema.NewRow());
+      ASSERT_OK(lower->SetInt32(kC0, 0));
+      unique_ptr<KuduRangePartition> p(
+          new KuduRangePartition(lower.release(), schema.NewRow()));
+      ASSERT_OK(p->add_hash_partitions({ kC0 }, 2, 0));
+      ASSERT_OK(p->add_hash_partitions({ kC1 }, 2, 1));
+      table_creator->add_custom_range_partition(p.release());
+    }
+
+    ASSERT_OK(table_creator->Create());
+    NO_FATALS(CheckTabletCount(kTableName, 13));  // 9 + 4 = 13
+    shared_ptr<KuduTable> table;
+    ASSERT_OK(client_->OpenTable(kTableName, &table));
+
+    vector<Partition> partitions;
+    ASSERT_OK(table->ListPartitions(&partitions));
+    ASSERT_EQ(13, partitions.size());
+
+    ASSERT_OK(rows_inserter(-100, 100));
+    NO_FATALS(CheckTableRowsNum(kTableName, 200));
+    ASSERT_OK(client_->DeleteTable(kTableName));
+  }
+
+  // Three range partitions: [-inf, -10)  [-10, 10)  [10, +inf)
+  {
+    unique_ptr<KuduTableCreator> table_creator(client_->NewTableCreator());
+    table_creator->table_name(kTableName)
+        .schema(&schema)
+        .num_replicas(1)
+        .add_hash_partitions({ kC0 }, 2)
+        .add_hash_partitions({ kC1 }, 2)
+        .set_range_partition_columns({ kC0 });
+
+    // Add a range partition with the table-wide hash schema.
+    {
+      unique_ptr<KuduPartialRow> lower(schema.NewRow());
+      unique_ptr<KuduPartialRow> upper(schema.NewRow());
+      ASSERT_OK(upper->SetInt32(kC0, -10));
+      table_creator->add_range_partition(lower.release(), upper.release());
+    }
+
+    // Add two range partitions with custom hash schemas.
+    {
+      auto p = CreateRangePartition(schema, kC0, -10, 10);
+      ASSERT_OK(p->add_hash_partitions({ kC0 }, 4, 0));
+      ASSERT_OK(p->add_hash_partitions({ kC1 }, 5, 1));
+      table_creator->add_custom_range_partition(p.release());
+    }
+    {
+      unique_ptr<KuduPartialRow> lower(schema.NewRow());
+      ASSERT_OK(lower->SetInt32(kC0, 10));
+      unique_ptr<KuduRangePartition> p(
+          new KuduRangePartition(lower.release(), schema.NewRow()));
+      ASSERT_OK(p->add_hash_partitions({ kC0 }, 3, 2));
+      ASSERT_OK(p->add_hash_partitions({ kC1 }, 4, 3));
+      table_creator->add_custom_range_partition(p.release());
+    }
+
+    ASSERT_OK(table_creator->Create());
+    NO_FATALS(CheckTabletCount(kTableName, 36));  // 4 + 20 + 12 = 36
+    shared_ptr<KuduTable> table;
+    ASSERT_OK(client_->OpenTable(kTableName, &table));
+
+    vector<Partition> partitions;
+    ASSERT_OK(table->ListPartitions(&partitions));
+    ASSERT_EQ(36, partitions.size());
+
+    ASSERT_OK(rows_inserter(-250, 250));
+    NO_FATALS(CheckTableRowsNum(kTableName, 500));
+    ASSERT_OK(client_->DeleteTable(kTableName));
+  }
+}
+
 // When working with a cluster that doesn't support range-specific hash schemas
 // for tables, the client should receive proper error while trying to create
 // a table with custom hash schema for at least one of its ranges.
diff --git a/src/kudu/common/partition-test.cc b/src/kudu/common/partition-test.cc
index b0db9208c..a230bdc37 100644
--- a/src/kudu/common/partition-test.cc
+++ b/src/kudu/common/partition-test.cc
@@ -468,9 +468,9 @@ TEST_F(PartitionTest, TestCreateHashPartitions) {
 
   // Encoded Partition Keys:
   //
-  // [ (_), (1) )
+  // [ (0), (1) )
   // [ (1), (2) )
-  // [ (2), (_) )
+  // [ (2), (3) )
 
   vector<Partition> partitions;
   ASSERT_OK(
@@ -480,7 +480,7 @@ TEST_F(PartitionTest, TestCreateHashPartitions) {
   EXPECT_EQ(0, partitions[0].hash_buckets()[0]);
   EXPECT_TRUE(partitions[0].begin().range_key().empty());
   EXPECT_TRUE(partitions[0].end().range_key().empty());
-  EXPECT_TRUE(partitions[0].begin().empty());
+  EXPECT_EQ(string("\0\0\0\0", 4), partitions[0].begin().ToString());
   EXPECT_EQ(string("\0\0\0\1", 4), partitions[0].end().ToString());
   EXPECT_EQ("HASH (a) PARTITION 0",
             partition_schema.PartitionDebugString(partitions[0], schema));
@@ -497,7 +497,7 @@ TEST_F(PartitionTest, TestCreateHashPartitions) {
   EXPECT_TRUE(partitions[2].begin().range_key().empty());
   EXPECT_TRUE(partitions[2].end().range_key().empty());
   EXPECT_EQ(string("\0\0\0\2", 4), partitions[2].begin().ToString());
-  EXPECT_EQ("", partitions[2].end().ToString());
+  EXPECT_EQ(string("\0\0\0\3", 4), partitions[2].end().ToString());
   EXPECT_EQ("HASH (a) PARTITION 2",
             partition_schema.PartitionDebugString(partitions[2], schema));
 }
@@ -533,21 +533,21 @@ TEST_F(PartitionTest, TestCreatePartitions) {
   //
   // Encoded Partition Keys:
   //
-  // [ (_, _,        _), (0, 0, "a1b1c1") )
+  // [ (0, 0,        _), (0, 0, "a1b1c1") )
   // [ (0, 0, "a1b1c1"), (0, 0,   "a2b2") )
   // [ (0, 0,   "a2b2"), (0, 1,        _) )
   //
   // [ (0, 1,        _), (0, 1, "a1b1c1") )
   // [ (0, 1, "a1b1c1"), (0, 1,   "a2b2") )
-  // [ (0, 1,   "a2b2"), (1, _,        _) )
+  // [ (0, 1,   "a2b2"), (0, 2,        _) )
   //
-  // [ (1, _,        _), (1, 0, "a1b1c1") )
+  // [ (1, 0,        _), (1, 0, "a1b1c1") )
   // [ (1, 0, "a1b1c1"), (1, 0,   "a2b2") )
   // [ (1, 0,   "a2b2"), (1, 1,        _) )
   //
   // [ (1, 1,        _), (1, 1, "a1b1c1") )
   // [ (1, 1, "a1b1c1"), (1, 1,   "a2b2") )
-  // [ (1, 1,   "a2b2"), (_, _,        _) )
+  // [ (1, 1,   "a2b2"), (1, 2,        _) )
   //
   // _ signifies that the value is omitted from the encoded partition key.
 
@@ -570,7 +570,7 @@ TEST_F(PartitionTest, TestCreatePartitions) {
   EXPECT_EQ(0, partitions[0].hash_buckets()[1]);
   EXPECT_EQ("", partitions[0].begin().range_key());
   EXPECT_EQ(string("a1\0\0b1\0\0c1", 10), partitions[0].end().range_key());
-  EXPECT_EQ("", partitions[0].begin().ToString());
+  EXPECT_EQ(string("\0\0\0\0" "\0\0\0\0", 8), 
partitions[0].begin().ToString());
   EXPECT_EQ(string("\0\0\0\0" "\0\0\0\0" "a1\0\0b1\0\0c1", 18),
             partitions[0].end().ToString());
   EXPECT_EQ("HASH (a) PARTITION 0, HASH (b) PARTITION 0, "
@@ -639,7 +639,7 @@ TEST_F(PartitionTest, TestCreatePartitions) {
   EXPECT_EQ(string("a2\0\0b2\0\0", 8), partitions[5].begin().range_key());
   EXPECT_EQ("", partitions[5].end().range_key());
   EXPECT_EQ(string("\0\0\0\0" "\0\0\0\1" "a2\0\0b2\0\0", 16), 
partitions[5].begin().ToString());
-  EXPECT_EQ(string("\0\0\0\1", 4), partitions[5].end().ToString());
+  EXPECT_EQ(string("\0\0\0\0" "\0\0\0\2", 8), partitions[5].end().ToString());
   EXPECT_EQ("HASH (a) PARTITION 0, HASH (b) PARTITION 1, "
             R"(RANGE (a, b, c) PARTITION ("a2", "b2", "") <= VALUES)",
             partition_schema.PartitionDebugString(partitions[5], schema));
@@ -651,7 +651,7 @@ TEST_F(PartitionTest, TestCreatePartitions) {
   EXPECT_EQ(0, partitions[6].hash_buckets()[1]);
   EXPECT_EQ("", partitions[6].begin().range_key());
   EXPECT_EQ(string("a1\0\0b1\0\0c1", 10), partitions[6].end().range_key());
-  EXPECT_EQ(string("\0\0\0\1", 4), partitions[6].begin().ToString());
+  EXPECT_EQ(string("\0\0\0\1" "\0\0\0\0", 8), 
partitions[6].begin().ToString());
   EXPECT_EQ(string("\0\0\0\1" "\0\0\0\0" "a1\0\0b1\0\0c1", 18), 
partitions[6].end().ToString());
   EXPECT_EQ("HASH (a) PARTITION 1, HASH (b) PARTITION 0, "
             R"(RANGE (a, b, c) PARTITION VALUES < ("a1", "b1", "c1"))",
@@ -719,7 +719,7 @@ TEST_F(PartitionTest, TestCreatePartitions) {
   EXPECT_EQ(string("a2\0\0b2\0\0", 8), partitions[11].begin().range_key());
   EXPECT_EQ("", partitions[11].end().range_key());
   EXPECT_EQ(string("\0\0\0\1" "\0\0\0\1" "a2\0\0b2\0\0", 16), 
partitions[11].begin().ToString());
-  EXPECT_EQ("", partitions[11].end().ToString());
+  EXPECT_EQ(string("\0\0\0\1" "\0\0\0\2", 8), partitions[11].end().ToString());
   EXPECT_EQ("HASH (a) PARTITION 1, HASH (b) PARTITION 1, "
             R"(RANGE (a, b, c) PARTITION ("a2", "b2", "") <= VALUES)",
             partition_schema.PartitionDebugString(partitions[11], schema));
@@ -1297,7 +1297,7 @@ TEST_F(PartitionTest, VaryingHashSchemasPerUnboundedRanges) {
   EXPECT_EQ(0, partitions[0].hash_buckets()[0]);
   EXPECT_EQ("", partitions[0].begin().range_key());
   EXPECT_EQ(string("a1\0\0\0\0c1", 8), partitions[0].end().range_key());
-  EXPECT_EQ("", partitions[0].begin().ToString());
+  EXPECT_EQ(string("\0\0\0\0", 4), partitions[0].begin().ToString());
   EXPECT_EQ(string("\0\0\0\0" "a1\0\0\0\0c1", 12), 
partitions[0].end().ToString());
 
   ASSERT_EQ(1, partitions[1].hash_buckets().size());
@@ -1349,7 +1349,7 @@ TEST_F(PartitionTest, VaryingHashSchemasPerUnboundedRanges) {
   EXPECT_EQ(string("a4\0\0b4\0\0", 8), partitions[7].begin().range_key());
   EXPECT_EQ("", partitions[7].end().range_key());
   EXPECT_EQ(string("\0\0\0\0" "\0\0\0\2" "a4\0\0b4\0\0", 16), 
partitions[7].begin().ToString());
-  EXPECT_EQ(string("\0\0\0\1", 4), partitions[7].end().ToString());
+  EXPECT_EQ(string("\0\0\0\0" "\0\0\0\3", 8), partitions[7].end().ToString());
 
   ASSERT_EQ(2, partitions[8].hash_buckets().size());
   EXPECT_EQ(1, partitions[8].hash_buckets()[0]);
@@ -1373,7 +1373,7 @@ TEST_F(PartitionTest, VaryingHashSchemasPerUnboundedRanges) {
   EXPECT_EQ(string("a4\0\0b4\0\0", 8), partitions[10].begin().range_key());
   EXPECT_EQ("", partitions[10].end().range_key());
   EXPECT_EQ(string("\0\0\0\1" "\0\0\0\2" "a4\0\0b4\0\0", 16), 
partitions[10].begin().ToString());
-  EXPECT_EQ("", partitions[10].end().ToString());
+  EXPECT_EQ(string("\0\0\0\1" "\0\0\0\3", 8), partitions[10].end().ToString());
 }
 
 TEST_F(PartitionTest, NoHashSchemasForLastUnboundedRange) {
@@ -1432,7 +1432,7 @@ TEST_F(PartitionTest, NoHashSchemasForLastUnboundedRange) {
     EXPECT_EQ(0, p.hash_buckets()[0]);
     EXPECT_EQ("", p.begin().range_key());
     EXPECT_EQ(string("a1\0\0", 4), p.end().range_key());
-    EXPECT_EQ("", p.begin().ToString());
+    EXPECT_EQ(string("\0\0\0\0", 4), p.begin().ToString());
     EXPECT_EQ(string("\0\0\0\0" "a1\0\0c1", 8), p.end().ToString());
   }
   {
diff --git a/src/kudu/common/partition.cc b/src/kudu/common/partition.cc
index 9aef2e4dc..f666566d6 100644
--- a/src/kudu/common/partition.cc
+++ b/src/kudu/common/partition.cc
@@ -1489,58 +1489,90 @@ void PartitionSchema::UpdatePartitionBoundaries(
     const KeyEncoder<string>& hash_encoder,
     const HashSchema& partition_hash_schema,
     Partition* partition) {
+  if (partition_hash_schema.empty()) {
+    // A simple short-circuit in case of an empty hash schema.
+    return;
+  }
+
   // Note: the following discussion and logic only takes effect when the table's
-  // partition schema includes at least one hash bucket component, and the
+  // partition schema includes at least one hash dimension component, and the
   // absolute upper and/or absolute lower range bound is unbounded.
   //
-  // At this point, we have the full set of partitions built up, but each
-  // partition only covers a finite slice of the partition key-space. Some
-  // operations involving partitions are easier (pruning, client meta cache) if
-  // it can be assumed that the partition keyspace does not have holes.
+  // At this point, we have the full set of partitions built up, but due
+  // to the convention on the unbounded range keys and how PartitionKey
+  // instances are compared, it's necessary to add an iota to the hash part
+  // of the end key for each range that is unbounded on the right to keep
+  // the proper ordering of the ranges.
+  //
+  // It would be great to make the partition key space continuous
+  // (i.e. without holes) by carrying over iotas to the next hash dimension
+  // index, but with the introduction of range-specific hash schemas doing
+  // so would break the proper ordering of the ranges. For example, consider
+  // the following partitioning:
+  //   [-inf, 0) x 3 buckets x 3 buckets; [0, +inf) x 2 buckets x 2 buckets
+  //
+  // The original set of ranges looks like the following:
+  //
+  //  [ (0, 0,  ""), (0, 0, "0") )
+  //  [ (0, 1,  ""), (0, 1, "0") )
+  //  [ (0, 2,  ""), (0, 2, "0") )
+  //  [ (1, 0,  ""), (1, 0, "0") )
+  //  [ (1, 1,  ""), (1, 1, "0") )
+  //  [ (1, 2,  ""), (1, 2, "0") )
+  //  [ (2, 0,  ""), (2, 0, "0") )
+  //  [ (2, 1,  ""), (2, 1, "0") )
+  //  [ (2, 2,  ""), (2, 2, "0") )
+  //
+  //  [ (0, 0, "0"), (0, 0,  "") )
+  //  [ (0, 1, "0"), (0, 1,  "") )
+  //  [ (1, 0, "0"), (1, 0,  "") )
+  //  [ (1, 1, "0"), (1, 1,  "") )
+  //
+  // Below is the list of the same partitions, updated and sorted:
+  //  [ (0, 0,  ""), (0, 0, "0") )
+  //  [ (0, 0, "0"), (0, 1,  "") )
+  //  [ (0, 1,  ""), (0, 1, "0") )
+  //  [ (0, 1, "0"), (0, 2,  "") )
+  //  [ (0, 2,  ""), (0, 2, "0") )
+  //
+  //  [ (1, 0,  ""), (1, 0, "0") )
+  //  [ (1, 0, "0"), (1, 1,  "") )
+  //  [ (1, 1,  ""), (1, 1, "0") )
+  //  [ (1, 1, "0"), (1, 2,  "") )
+  //  [ (1, 2,  ""), (1, 2, "0") )
   //
-  // In order to 'fill in' the partition key space, the absolute first and last
-  // partitions are extended to cover the rest of the lower and upper partition
-  // range by clearing the start and end partition key, respectively.
+  //  [ (2, 0,  ""), (2, 0, "0") )
+  //  [ (2, 1,  ""), (2, 1, "0") )
+  //  [ (2, 2,  ""), (2, 2, "0") )
   //
-  // When the table has two or more hash components, there will be gaps in
-  // between partitions at the boundaries of the component ranges. Similar to
-  // the absolute start and end case, these holes are filled by clearing the
-  // partition key beginning at the hash component. For a concrete example,
-  // see PartitionTest::TestCreatePartitions.
+  // An alternate way of updating the hash components that tries to keep the
+  // key space continuous would transform [ (0, 1, "0"), (0, 1,  "") ) into
+  // [ (0, 1, "0"), (1, 0,  "") ), but that would put range
+  // [ (0, 2,  ""), (0, 2, "0") ) inside the transformed one. That's not
+  // consistent since it messes up the interval logic of the partition pruning
+  // and the client's metacache: both are built under the assumption that
+  // partition key ranges backed by different tablets do not intersect.
   DCHECK(partition);
-  auto& p = *partition; // just a handy shortcut
+  auto& p = *partition; // just a handy shortcut for the code below
   CHECK_EQ(partition_hash_schema.size(), p.hash_buckets().size());
-  const auto hash_buckets_num = static_cast<int>(p.hash_buckets().size());
-  // Find the first nonzero-valued bucket from the end and truncate the
-  // partition key starting from that bucket onwards for zero-valued buckets.
-  // This is necessary to complement the truncation performed below for the
-  // hash part of the partition's end key below.
-  if (p.begin().range_key().empty()) {
-    for (int i = hash_buckets_num - 1; i >= 0; --i) {
-      if (p.hash_buckets()[i] != 0) {
-        break;
-      }
-      p.begin_.mutable_hash_key()->erase(kEncodedBucketSize * i);
-    }
-  }
-  // Starting from the last hash bucket, truncate the partition key until we hit the first
-  // non-max-valued bucket, at which point, replace the encoding with the next-incremented
-  // bucket value. For example, the following range end partition keys should be transformed,
-  // where the key is (hash_comp1, hash_comp2, range_comp):
+  const auto hash_dimensions_num = static_cast<int>(p.hash_buckets().size());
+
+  // Increment the hash bucket component of the last hash dimension for
+  // each range that has an unbounded end range key.
   //
-  // [ (0, 0, "") -> (0, 1, "") ]
-  // [ (0, 1, "") -> (1, _, "") ]
-  // [ (1, 0, "") -> (1, 1, "") ]
-  // [ (1, 1, "") -> (_, _, "") ]
+  // For example, the following is the result of the range end partition keys'
+  // transformation, where the key is (hash_comp1, hash_comp2, range_comp) and
+  // the number of hash buckets per dimension is 2:
+  //   [ (0, 0, "") -> (0, 1, "") ]
+  //   [ (0, 1, "") -> (0, 2, "") ]
+  //   [ (1, 0, "") -> (1, 1, "") ]
+  //   [ (1, 1, "") -> (1, 2, "") ]
   if (p.end().range_key().empty()) {
-    for (int i = hash_buckets_num - 1; i >= 0; --i) {
-      p.end_.mutable_hash_key()->erase(kEncodedBucketSize * i);
-      const int32_t hash_bucket = p.hash_buckets()[i] + 1;
-      if (hash_bucket != partition_hash_schema[i].num_buckets) {
-        hash_encoder.Encode(&hash_bucket, p.end_.mutable_hash_key());
-        break;
-      }
-    }
+    const int dimension_idx = hash_dimensions_num - 1;
+    DCHECK_GE(dimension_idx, 0);
+    p.end_.mutable_hash_key()->erase(kEncodedBucketSize * dimension_idx);
+    const int32_t bucket = p.hash_buckets()[dimension_idx] + 1;
+    hash_encoder.Encode(&bucket, p.end_.mutable_hash_key());
   }
 }
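
For readers who want to reproduce the encoded keys in the updated test
expectations, the new end-key rule distills to the following standalone
sketch (the helper names are illustrative, not the actual Kudu API; it
assumes the 4-byte big-endian bucket encoding seen in the tests above):

    #include <cstdint>
    #include <string>
    #include <vector>

    // Append one hash bucket as 4 big-endian bytes.
    void EncodeBucket(int32_t bucket, std::string* out) {
      for (int shift = 24; shift >= 0; shift -= 8) {
        *out += static_cast<char>((bucket >> shift) & 0xff);
      }
    }

    // For a partition whose end range key is unbounded, the end hash key is
    // the partition's hash buckets with the bucket of the last dimension
    // incremented; there is no carry-over into other dimensions.
    std::string EndHashKeyForUnboundedRange(const std::vector<int32_t>& buckets) {
      std::string key;
      if (buckets.empty()) {
        return key;  // the real code short-circuits on an empty hash schema
      }
      for (size_t i = 0; i + 1 < buckets.size(); ++i) {
        EncodeBucket(buckets[i], &key);
      }
      EncodeBucket(buckets.back() + 1, &key);
      return key;
    }

For example, buckets {0, 1} yield "\0\0\0\0" "\0\0\0\2" and buckets {2}
yield "\0\0\0\3", matching the updated expectations for partitions[5] in
TestCreatePartitions and partitions[2] in TestCreateHashPartitions above.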
 
