This is an automated email from the ASF dual-hosted git repository. alexey pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/kudu.git
commit 8df970f7a6520bb0dc0f9cc89ad7f62ab349e84d Author: Alexey Serbin <ale...@apache.org> AuthorDate: Fri Jul 29 19:57:01 2022 -0700 KUDU-2671 fix updating partition end keys for unbounded ranges This patch fixes an issue with updating the hash bucket components for partitions' end keys in case of unbounded ranges. I also added new test scenarios that allowed to spot the issue. The newly added scenarios would fail if not including the fix. In addition, I updated a few other test scenarios to match the current logic of updating the hash partition component of partition end key. The previous implementation of updating the hash components tried to avoid holes in the partition keyspace by carrying over iotas to next hash dimension index, but with the introduction of range-specific hash schemas it's no longer possible to do so while keeping the result key ranges disjoint. For example, consider the following partitioning: [-inf, 0) x 3 buckets x 3 buckets; [0, +inf) x 2 buckets x 2 buckets The original set of ranges looks like the following: [ (0, 0, ""), (0, 0, "0") ) [ (0, 1, ""), (0, 1, "0") ) [ (0, 2, ""), (0, 2, "0") ) [ (1, 0, ""), (1, 0, "0") ) [ (1, 1, ""), (1, 1, "0") ) [ (1, 2, ""), (1, 2, "0") ) [ (2, 0, ""), (2, 0, "0") ) [ (2, 1, ""), (2, 1, "0") ) [ (2, 2, ""), (2, 2, "0") ) [ (0, 0, "0"), (0, 0, "") ) [ (0, 1, "0"), (0, 1, "") ) [ (1, 0, "0"), (1, 0, "") ) [ (1, 1, "0"), (1, 1, "") ) The previous implementation would transform the key range [ (0, 1, "0"), (0, 1, "") ) into [ (0, 1, "0"), (1, 0, "") ), but that would put the key range [ (0, 2, ""), (0, 2, "0") ) inside the transformed one. That would mess up the interval logic of the partition pruning and the client's metacache. As it turns out, the continuous keyspace was just a nice thing to have since the partition pruning and the client metacache work fine with sparse keyspaces as well. Change-Id: I3775f914a3b7cdc294a26911663c2d848f4e491b Reviewed-on: http://gerrit.cloudera.org:8080/18808 Tested-by: Kudu Jenkins Reviewed-by: Mahesh Reddy <mre...@cloudera.com> Reviewed-by: Abhishek Chennaka <achenn...@cloudera.com> Reviewed-by: Attila Bukor <abu...@apache.org> --- .../org/apache/kudu/client/TestAlterTable.java | 10 +- src/kudu/client/flex_partitioning_client-test.cc | 184 ++++++++++++++++++++- src/kudu/common/partition-test.cc | 32 ++-- src/kudu/common/partition.cc | 118 ++++++++----- 4 files changed, 279 insertions(+), 65 deletions(-) diff --git a/java/kudu-client/src/test/java/org/apache/kudu/client/TestAlterTable.java b/java/kudu-client/src/test/java/org/apache/kudu/client/TestAlterTable.java index f0270b446..c10b6402d 100644 --- a/java/kudu-client/src/test/java/org/apache/kudu/client/TestAlterTable.java +++ b/java/kudu-client/src/test/java/org/apache/kudu/client/TestAlterTable.java @@ -253,12 +253,16 @@ public class TestAlterTable { KuduScanner scanner = client.newScannerBuilder(table) .setProjectedColumnNames(Lists.newArrayList("c0Key", "c1")).build(); + int rowCount = 0; while (scanner.hasMoreRows()) { RowResultIterator it = scanner.nextRows(); - assertTrue(it.hasNext()); - RowResult rr = it.next(); - assertEquals(rr.getInt(0), rr.getInt(1)); + while (it.hasNext()) { + RowResult rr = it.next(); + assertEquals(rr.getInt(0), rr.getInt(1)); + ++rowCount; + } } + assertEquals(101, rowCount); } @Test diff --git a/src/kudu/client/flex_partitioning_client-test.cc b/src/kudu/client/flex_partitioning_client-test.cc index 6b42eff1b..66d504a83 100644 --- a/src/kudu/client/flex_partitioning_client-test.cc +++ b/src/kudu/client/flex_partitioning_client-test.cc @@ -27,14 +27,15 @@ #include <glog/logging.h> #include <gtest/gtest.h> -#include "kudu/client/client.h" #include "kudu/client/client-test-util.h" +#include "kudu/client/client.h" #include "kudu/client/scan_batch.h" #include "kudu/client/scan_predicate.h" #include "kudu/client/schema.h" #include "kudu/client/value.h" #include "kudu/client/write_op.h" #include "kudu/common/partial_row.h" +#include "kudu/common/partition.h" #include "kudu/gutil/port.h" #include "kudu/gutil/ref_counted.h" #include "kudu/gutil/stl_util.h" @@ -43,14 +44,14 @@ #include "kudu/master/master.h" #include "kudu/master/mini_master.h" #include "kudu/mini-cluster/internal_mini_cluster.h" -#include "kudu/tablet/tablet_replica.h" #include "kudu/tablet/tablet.h" +#include "kudu/tablet/tablet_replica.h" #include "kudu/tserver/mini_tablet_server.h" #include "kudu/tserver/tablet_server.h" #include "kudu/tserver/ts_tablet_manager.h" -#include "kudu/util/metrics.h" #include "kudu/util/curl_util.h" #include "kudu/util/faststring.h" +#include "kudu/util/metrics.h" #include "kudu/util/net/sockaddr.h" #include "kudu/util/slice.h" #include "kudu/util/status.h" @@ -1562,6 +1563,183 @@ TEST_F(FlexPartitioningCreateTableTest, Negatives) { } } +// This test scenario verifies that ListPartitions() returns correct number +// of partitions for tables with unbounded ranges with custom hash schemas. +// Essentially, this scenario makes sure the logic used by the code to iterate +// over range partitions by employing PartitionPruner's NextPartitionKey() and +// RemovePartitionKeyRange() methods works as expected. +TEST_F(FlexPartitioningCreateTableTest, UnboundedRangesOneDimensionHash) { + constexpr const char* const kTableName = "UnboundedRangesOneDimensionHash"; + + unique_ptr<KuduTableCreator> table_creator(client_->NewTableCreator()); + table_creator->table_name(kTableName) + .schema(&schema_) + .num_replicas(1) + .add_hash_partitions({ kKeyColumn }, 2) + .set_range_partition_columns({ kKeyColumn }); + + // Add a range partition with the table-wide hash schema. + { + unique_ptr<KuduPartialRow> lower(schema_.NewRow()); + unique_ptr<KuduPartialRow> upper(schema_.NewRow()); + ASSERT_OK(upper->SetInt32(kKeyColumn, -10)); + table_creator->add_range_partition(lower.release(), upper.release()); + } + + // Add two range partitions with custom hash schemas. + { + auto p = CreateRangePartition(-10, 10); + ASSERT_OK(p->add_hash_partitions({ kKeyColumn }, 5, 1)); + table_creator->add_custom_range_partition(p.release()); + } + { + auto p = CreateRangePartitionNoUpperBound(10); + ASSERT_OK(p->add_hash_partitions({ kKeyColumn }, 3, 2)); + table_creator->add_custom_range_partition(p.release()); + } + + ASSERT_OK(table_creator->Create()); + NO_FATALS(CheckTabletCount(kTableName, 10)); // 2 + 5 + 3 = 10 + shared_ptr<KuduTable> table; + ASSERT_OK(client_->OpenTable(kTableName, &table)); + + vector<Partition> partitions; + ASSERT_OK(table->ListPartitions(&partitions)); + ASSERT_EQ(10, partitions.size()); + + // Make sure it's possible to insert rows into the table for all the existing + // partitions. + ASSERT_OK(InsertTestRows(kTableName, -50, 50)); + NO_FATALS(CheckTableRowsNum(kTableName, 100)); +} + +// Similar to FlexPartitioningCreateTableTest.UnboundedRangesOneDimensionHash +// abobe, but with two hash dimensions. +TEST_F(FlexPartitioningCreateTableTest, UnboundedRangesTwoDimensionHash) { + constexpr const char* const kTableName = "UnboundedRangesTwoDimensionHash"; + constexpr const char* const kC0 = "c0"; + constexpr const char* const kC1 = "c1"; + constexpr const char* const kC2 = "c2"; + + KuduSchemaBuilder b; + b.AddColumn(kC0)->Type(KuduColumnSchema::INT32)->NotNull(); + b.AddColumn(kC1)->Type(KuduColumnSchema::INT32)->NotNull(); + b.AddColumn(kC2)->Type(KuduColumnSchema::STRING)->Nullable(); + b.SetPrimaryKey({ kC0, kC1 }); + KuduSchema schema; + ASSERT_OK(b.Build(&schema)); + + auto rows_inserter = [&](int32_t key_beg, int32_t key_end) { + vector<KuduError*> errors; + CHECK_LE(key_beg, key_end); + shared_ptr<KuduTable> table; + RETURN_NOT_OK(client_->OpenTable(kTableName, &table)); + shared_ptr<KuduSession> session = client_->NewSession(); + RETURN_NOT_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + session->SetTimeoutMillis(60000); + for (int32_t key_val = key_beg; key_val < key_end; ++key_val) { + unique_ptr<KuduInsert> insert(table->NewInsert()); + RETURN_NOT_OK(insert->mutable_row()->SetInt32(kC0, key_val)); + RETURN_NOT_OK(insert->mutable_row()->SetInt32(kC1, key_val)); + RETURN_NOT_OK(insert->mutable_row()->SetStringCopy(kC2, std::to_string(rand()))); + RETURN_NOT_OK(session->Apply(insert.release())); + } + return session->Flush(); + }; + + // Two range partitions: [-inf, 0) [0, +inf) + { + unique_ptr<KuduTableCreator> table_creator(client_->NewTableCreator()); + table_creator->table_name(kTableName) + .schema(&schema) + .num_replicas(1) + .add_hash_partitions({ kC0 }, 3) + .add_hash_partitions({ kC1 }, 3) + .set_range_partition_columns({ kC0 }); + + // Add a range partition with the table-wide hash schema. + { + unique_ptr<KuduPartialRow> lower(schema.NewRow()); + unique_ptr<KuduPartialRow> upper(schema.NewRow()); + ASSERT_OK(upper->SetInt32(kC0, 0)); + table_creator->add_range_partition(lower.release(), upper.release()); + } + + // Add a range partition with custom hash schema. + { + unique_ptr<KuduPartialRow> lower(schema.NewRow()); + ASSERT_OK(lower->SetInt32(kC0, 0)); + unique_ptr<KuduRangePartition> p( + new KuduRangePartition(lower.release(), schema.NewRow())); + ASSERT_OK(p->add_hash_partitions({ kC0 }, 2, 0)); + ASSERT_OK(p->add_hash_partitions({ kC1 }, 2, 1)); + table_creator->add_custom_range_partition(p.release()); + } + + ASSERT_OK(table_creator->Create()); + NO_FATALS(CheckTabletCount(kTableName, 13)); + shared_ptr<KuduTable> table; + ASSERT_OK(client_->OpenTable(kTableName, &table)); + + vector<Partition> partitions; + ASSERT_OK(table->ListPartitions(&partitions)); + ASSERT_EQ(13, partitions.size()); + + ASSERT_OK(rows_inserter(-100, 100)); + NO_FATALS(CheckTableRowsNum(kTableName, 200)); + ASSERT_OK(client_->DeleteTable(kTableName)); + } + + // Three range partitions: [-inf, -10) [-10, 10) [10, +inf) + { + unique_ptr<KuduTableCreator> table_creator(client_->NewTableCreator()); + table_creator->table_name(kTableName) + .schema(&schema) + .num_replicas(1) + .add_hash_partitions({ kC0 }, 2) + .add_hash_partitions({ kC1 }, 2) + .set_range_partition_columns({ kC0 }); + + // Add a range partition with the table-wide hash schema. + { + unique_ptr<KuduPartialRow> lower(schema.NewRow()); + unique_ptr<KuduPartialRow> upper(schema.NewRow()); + ASSERT_OK(upper->SetInt32(kC0, -10)); + table_creator->add_range_partition(lower.release(), upper.release()); + } + + // Add two range partitions with custom hash schemas. + { + auto p = CreateRangePartition(schema, kC0, -10, 10); + ASSERT_OK(p->add_hash_partitions({ kC0 }, 4, 0)); + ASSERT_OK(p->add_hash_partitions({ kC1 }, 5, 1)); + table_creator->add_custom_range_partition(p.release()); + } + { + unique_ptr<KuduPartialRow> lower(schema.NewRow()); + ASSERT_OK(lower->SetInt32(kC0, 10)); + unique_ptr<KuduRangePartition> p( + new KuduRangePartition(lower.release(), schema.NewRow())); + ASSERT_OK(p->add_hash_partitions({ kC0 }, 3, 2)); + ASSERT_OK(p->add_hash_partitions({ kC1 }, 4, 3)); + table_creator->add_custom_range_partition(p.release()); + } + + ASSERT_OK(table_creator->Create()); + NO_FATALS(CheckTabletCount(kTableName, 36)); // 4 + 20 + 12 = 36 + shared_ptr<KuduTable> table; + ASSERT_OK(client_->OpenTable(kTableName, &table)); + + vector<Partition> partitions; + ASSERT_OK(table->ListPartitions(&partitions)); + ASSERT_EQ(36, partitions.size()); + + ASSERT_OK(rows_inserter(-250, 250)); + NO_FATALS(CheckTableRowsNum(kTableName, 500)); + ASSERT_OK(client_->DeleteTable(kTableName)); + } +} + // When working with a cluster that doesn't support range-specific hash schemas // for tables, the client should receive proper error while trying to create // a table with custom hash schema for at least one of its ranges. diff --git a/src/kudu/common/partition-test.cc b/src/kudu/common/partition-test.cc index b0db9208c..a230bdc37 100644 --- a/src/kudu/common/partition-test.cc +++ b/src/kudu/common/partition-test.cc @@ -468,9 +468,9 @@ TEST_F(PartitionTest, TestCreateHashPartitions) { // Encoded Partition Keys: // - // [ (_), (1) ) + // [ (0), (1) ) // [ (1), (2) ) - // [ (2), (_) ) + // [ (2), (3) ) vector<Partition> partitions; ASSERT_OK( @@ -480,7 +480,7 @@ TEST_F(PartitionTest, TestCreateHashPartitions) { EXPECT_EQ(0, partitions[0].hash_buckets()[0]); EXPECT_TRUE(partitions[0].begin().range_key().empty()); EXPECT_TRUE(partitions[0].end().range_key().empty()); - EXPECT_TRUE(partitions[0].begin().empty()); + EXPECT_EQ(string("\0\0\0\0", 4), partitions[0].begin().ToString()); EXPECT_EQ(string("\0\0\0\1", 4), partitions[0].end().ToString()); EXPECT_EQ("HASH (a) PARTITION 0", partition_schema.PartitionDebugString(partitions[0], schema)); @@ -497,7 +497,7 @@ TEST_F(PartitionTest, TestCreateHashPartitions) { EXPECT_TRUE(partitions[2].begin().range_key().empty()); EXPECT_TRUE(partitions[2].end().range_key().empty()); EXPECT_EQ(string("\0\0\0\2", 4), partitions[2].begin().ToString()); - EXPECT_EQ("", partitions[2].end().ToString()); + EXPECT_EQ(string("\0\0\0\3", 4), partitions[2].end().ToString()); EXPECT_EQ("HASH (a) PARTITION 2", partition_schema.PartitionDebugString(partitions[2], schema)); } @@ -533,21 +533,21 @@ TEST_F(PartitionTest, TestCreatePartitions) { // // Encoded Partition Keys: // - // [ (_, _, _), (0, 0, "a1b1c1") ) + // [ (0, 0, _), (0, 0, "a1b1c1") ) // [ (0, 0, "a1b1c1"), (0, 0, "a2b2") ) // [ (0, 0, "a2b2"), (0, 1, _) ) // // [ (0, 1, _), (0, 1, "a1b1c1") ) // [ (0, 1, "a1b1c1"), (0, 1, "a2b2") ) - // [ (0, 1, "a2b2"), (1, _, _) ) + // [ (0, 1, "a2b2"), (0, 2, _) ) // - // [ (1, _, _), (1, 0, "a1b1c1") ) + // [ (1, 0, _), (1, 0, "a1b1c1") ) // [ (1, 0, "a1b1c1"), (1, 0, "a2b2") ) // [ (1, 0, "a2b2"), (1, 1, _) ) // // [ (1, 1, _), (1, 1, "a1b1c1") ) // [ (1, 1, "a1b1c1"), (1, 1, "a2b2") ) - // [ (1, 1, "a2b2"), (_, _, _) ) + // [ (1, 1, "a2b2"), (1, 2, _) ) // // _ signifies that the value is omitted from the encoded partition key. @@ -570,7 +570,7 @@ TEST_F(PartitionTest, TestCreatePartitions) { EXPECT_EQ(0, partitions[0].hash_buckets()[1]); EXPECT_EQ("", partitions[0].begin().range_key()); EXPECT_EQ(string("a1\0\0b1\0\0c1", 10), partitions[0].end().range_key()); - EXPECT_EQ("", partitions[0].begin().ToString()); + EXPECT_EQ(string("\0\0\0\0" "\0\0\0\0", 8), partitions[0].begin().ToString()); EXPECT_EQ(string("\0\0\0\0" "\0\0\0\0" "a1\0\0b1\0\0c1", 18), partitions[0].end().ToString()); EXPECT_EQ("HASH (a) PARTITION 0, HASH (b) PARTITION 0, " @@ -639,7 +639,7 @@ TEST_F(PartitionTest, TestCreatePartitions) { EXPECT_EQ(string("a2\0\0b2\0\0", 8), partitions[5].begin().range_key()); EXPECT_EQ("", partitions[5].end().range_key()); EXPECT_EQ(string("\0\0\0\0" "\0\0\0\1" "a2\0\0b2\0\0", 16), partitions[5].begin().ToString()); - EXPECT_EQ(string("\0\0\0\1", 4), partitions[5].end().ToString()); + EXPECT_EQ(string("\0\0\0\0" "\0\0\0\2", 8), partitions[5].end().ToString()); EXPECT_EQ("HASH (a) PARTITION 0, HASH (b) PARTITION 1, " R"(RANGE (a, b, c) PARTITION ("a2", "b2", "") <= VALUES)", partition_schema.PartitionDebugString(partitions[5], schema)); @@ -651,7 +651,7 @@ TEST_F(PartitionTest, TestCreatePartitions) { EXPECT_EQ(0, partitions[6].hash_buckets()[1]); EXPECT_EQ("", partitions[6].begin().range_key()); EXPECT_EQ(string("a1\0\0b1\0\0c1", 10), partitions[6].end().range_key()); - EXPECT_EQ(string("\0\0\0\1", 4), partitions[6].begin().ToString()); + EXPECT_EQ(string("\0\0\0\1" "\0\0\0\0", 8), partitions[6].begin().ToString()); EXPECT_EQ(string("\0\0\0\1" "\0\0\0\0" "a1\0\0b1\0\0c1", 18), partitions[6].end().ToString()); EXPECT_EQ("HASH (a) PARTITION 1, HASH (b) PARTITION 0, " R"(RANGE (a, b, c) PARTITION VALUES < ("a1", "b1", "c1"))", @@ -719,7 +719,7 @@ TEST_F(PartitionTest, TestCreatePartitions) { EXPECT_EQ(string("a2\0\0b2\0\0", 8), partitions[11].begin().range_key()); EXPECT_EQ("", partitions[11].end().range_key()); EXPECT_EQ(string("\0\0\0\1" "\0\0\0\1" "a2\0\0b2\0\0", 16), partitions[11].begin().ToString()); - EXPECT_EQ("", partitions[11].end().ToString()); + EXPECT_EQ(string("\0\0\0\1" "\0\0\0\2", 8), partitions[11].end().ToString()); EXPECT_EQ("HASH (a) PARTITION 1, HASH (b) PARTITION 1, " R"(RANGE (a, b, c) PARTITION ("a2", "b2", "") <= VALUES)", partition_schema.PartitionDebugString(partitions[11], schema)); @@ -1297,7 +1297,7 @@ TEST_F(PartitionTest, VaryingHashSchemasPerUnboundedRanges) { EXPECT_EQ(0, partitions[0].hash_buckets()[0]); EXPECT_EQ("", partitions[0].begin().range_key()); EXPECT_EQ(string("a1\0\0\0\0c1", 8), partitions[0].end().range_key()); - EXPECT_EQ("", partitions[0].begin().ToString()); + EXPECT_EQ(string("\0\0\0\0", 4), partitions[0].begin().ToString()); EXPECT_EQ(string("\0\0\0\0" "a1\0\0\0\0c1", 12), partitions[0].end().ToString()); ASSERT_EQ(1, partitions[1].hash_buckets().size()); @@ -1349,7 +1349,7 @@ TEST_F(PartitionTest, VaryingHashSchemasPerUnboundedRanges) { EXPECT_EQ(string("a4\0\0b4\0\0", 8), partitions[7].begin().range_key()); EXPECT_EQ("", partitions[7].end().range_key()); EXPECT_EQ(string("\0\0\0\0" "\0\0\0\2" "a4\0\0b4\0\0", 16), partitions[7].begin().ToString()); - EXPECT_EQ(string("\0\0\0\1", 4), partitions[7].end().ToString()); + EXPECT_EQ(string("\0\0\0\0" "\0\0\0\3", 8), partitions[7].end().ToString()); ASSERT_EQ(2, partitions[8].hash_buckets().size()); EXPECT_EQ(1, partitions[8].hash_buckets()[0]); @@ -1373,7 +1373,7 @@ TEST_F(PartitionTest, VaryingHashSchemasPerUnboundedRanges) { EXPECT_EQ(string("a4\0\0b4\0\0", 8), partitions[10].begin().range_key()); EXPECT_EQ("", partitions[10].end().range_key()); EXPECT_EQ(string("\0\0\0\1" "\0\0\0\2" "a4\0\0b4\0\0", 16), partitions[10].begin().ToString()); - EXPECT_EQ("", partitions[10].end().ToString()); + EXPECT_EQ(string("\0\0\0\1" "\0\0\0\3", 8), partitions[10].end().ToString()); } TEST_F(PartitionTest, NoHashSchemasForLastUnboundedRange) { @@ -1432,7 +1432,7 @@ TEST_F(PartitionTest, NoHashSchemasForLastUnboundedRange) { EXPECT_EQ(0, p.hash_buckets()[0]); EXPECT_EQ("", p.begin().range_key()); EXPECT_EQ(string("a1\0\0", 4), p.end().range_key()); - EXPECT_EQ("", p.begin().ToString()); + EXPECT_EQ(string("\0\0\0\0", 4), p.begin().ToString()); EXPECT_EQ(string("\0\0\0\0" "a1\0\0c1", 8), p.end().ToString()); } { diff --git a/src/kudu/common/partition.cc b/src/kudu/common/partition.cc index 9aef2e4dc..f666566d6 100644 --- a/src/kudu/common/partition.cc +++ b/src/kudu/common/partition.cc @@ -1489,58 +1489,90 @@ void PartitionSchema::UpdatePartitionBoundaries( const KeyEncoder<string>& hash_encoder, const HashSchema& partition_hash_schema, Partition* partition) { + if (partition_hash_schema.empty()) { + // A simple short-circuit in case of an empty hash schema. + return; + } + // Note: the following discussion and logic only takes effect when the table's - // partition schema includes at least one hash bucket component, and the + // partition schema includes at least one hash dimension component, and the // absolute upper and/or absolute lower range bound is unbounded. // - // At this point, we have the full set of partitions built up, but each - // partition only covers a finite slice of the partition key-space. Some - // operations involving partitions are easier (pruning, client meta cache) if - // it can be assumed that the partition keyspace does not have holes. + // At this point, we have the full set of partitions built up, but due + // to the convention on the unbounded range keys and how PartitionKey + // instances are compared, it's necessary to add an iota to the hash part + // of the key for each unbounded range at the right end to keep proper + // ordering of the ranges. + // + // It would be great to make the partition key space continuous + // (i.e. without holes) by carrying over iotas to next hash dimension index, + // but that would break the proper ordering of the ranges since the + // introduction of range-specific hash schemas. For example, consider the + // following partitioning: + // [-inf, 0) x 3 buckets x 3 buckets; [0, +inf) x 2 buckets x 2 buckets + // + // The original set of ranges looks like the following: + // + // [ (0, 0, ""), (0, 0, "0") ) + // [ (0, 1, ""), (0, 1, "0") ) + // [ (0, 2, ""), (0, 2, "0") ) + // [ (1, 0, ""), (1, 0, "0") ) + // [ (1, 1, ""), (1, 1, "0") ) + // [ (1, 2, ""), (1, 2, "0") ) + // [ (2, 0, ""), (2, 0, "0") ) + // [ (2, 1, ""), (2, 1, "0") ) + // [ (2, 2, ""), (2, 2, "0") ) + // + // [ (0, 0, "0"), (0, 0, "") ) + // [ (0, 1, "0"), (0, 1, "") ) + // [ (1, 0, "0"), (1, 0, "") ) + // [ (1, 1, "0"), (1, 1, "") ) + // + // Below is the list of the same partitions, updated and sorted: + // [ (0, 0, ""), (0, 0, "0") ) + // [ (0, 0, "0"), (0, 1, "") ) + // [ (0, 1, ""), (0, 1, "0") ) + // [ (0, 1, "0"), (0, 2, "") ) + // [ (0, 2, ""), (0, 2, "0") ) + // + // [ (1, 0, ""), (1, 0, "0") ) + // [ (1, 0, "0"), (1, 1, "") ) + // [ (1, 1, ""), (1, 1, "0") ) + // [ (1, 1, "0"), (1, 2, "") ) + // [ (1, 2, ""), (1, 2, "0") ) // - // In order to 'fill in' the partition key space, the absolute first and last - // partitions are extended to cover the rest of the lower and upper partition - // range by clearing the start and end partition key, respectively. + // [ (2, 0, ""), (2, 0, "0") ) + // [ (2, 1, ""), (2, 1, "0") ) + // [ (2, 2, ""), (2, 2, "0") ) // - // When the table has two or more hash components, there will be gaps in - // between partitions at the boundaries of the component ranges. Similar to - // the absolute start and end case, these holes are filled by clearing the - // partition key beginning at the hash component. For a concrete example, - // see PartitionTest::TestCreatePartitions. + // An alternate way of updating the hash components that tries to keep the + // key space continuous would transform [ (0, 1, "0"), (0, 1, "") ) into + // [ (0, 1, "0"), (1, 0, "") ), but that would put range + // [ (0, 2, ""), (0, 2, "0") ) inside the transformed one. That's not + // consistent since it messes up the interval logic of the partition pruning + // and the client's metacache: both are built under the assumption that + // partition key ranges backed by different tablets do not intersect. DCHECK(partition); - auto& p = *partition; // just a handy shortcut + auto& p = *partition; // just a handy shortcut for the code below CHECK_EQ(partition_hash_schema.size(), p.hash_buckets().size()); - const auto hash_buckets_num = static_cast<int>(p.hash_buckets().size()); - // Find the first nonzero-valued bucket from the end and truncate the - // partition key starting from that bucket onwards for zero-valued buckets. - // This is necessary to complement the truncation performed below for the - // hash part of the partition's end key below. - if (p.begin().range_key().empty()) { - for (int i = hash_buckets_num - 1; i >= 0; --i) { - if (p.hash_buckets()[i] != 0) { - break; - } - p.begin_.mutable_hash_key()->erase(kEncodedBucketSize * i); - } - } - // Starting from the last hash bucket, truncate the partition key until we hit the first - // non-max-valued bucket, at which point, replace the encoding with the next-incremented - // bucket value. For example, the following range end partition keys should be transformed, - // where the key is (hash_comp1, hash_comp2, range_comp): + const auto hash_dimensions_num = static_cast<int>(p.hash_buckets().size()); + + // Increment the hash bucket component of the last hash dimension for + // each range that has the unbounded end range key. // - // [ (0, 0, "") -> (0, 1, "") ] - // [ (0, 1, "") -> (1, _, "") ] - // [ (1, 0, "") -> (1, 1, "") ] - // [ (1, 1, "") -> (_, _, "") ] + // For example, the following is the result of the range end partition keys' + // transformation, where the key is (hash_comp1, hash_comp2, range_comp) and + // the number of hush buckets per each dimensions is 2: + // [ (0, 0, "") -> (0, 1, "") ] + // [ (0, 1, "") -> (0, 2, "") ] + // [ (1, 0, "") -> (1, 1, "") ] + // [ (1, 1, "") -> (1, 2, "") ] if (p.end().range_key().empty()) { - for (int i = hash_buckets_num - 1; i >= 0; --i) { - p.end_.mutable_hash_key()->erase(kEncodedBucketSize * i); - const int32_t hash_bucket = p.hash_buckets()[i] + 1; - if (hash_bucket != partition_hash_schema[i].num_buckets) { - hash_encoder.Encode(&hash_bucket, p.end_.mutable_hash_key()); - break; - } - } + const int dimension_idx = hash_dimensions_num - 1; + DCHECK_GE(dimension_idx, 0); + p.end_.mutable_hash_key()->erase(kEncodedBucketSize * dimension_idx); + const int32_t bucket = p.hash_buckets()[dimension_idx] + 1; + hash_encoder.Encode(&bucket, p.end_.mutable_hash_key()); } }