This is an automated email from the ASF dual-hosted git repository. ngangam pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push: new dc9d1ad0965 HIVE-27595: replaced cartesian-product table filtering by sort + binary search (#4601) dc9d1ad0965 is described below commit dc9d1ad0965ece6a09abdb6481ca113dafce7c75 Author: Henrib <hbies...@gmail.com> AuthorDate: Thu Aug 24 15:33:16 2023 +0200 HIVE-27595: replaced cartesian-product table filtering by sort + binary search (#4601) * HIVE-27595: replaced cartesian-product filtering by sort + binary search lookup reducing complexity from m*n to (m+n)*logn; (Henri Biestro reviewed by John Sherman, Saihemanth) --- .../plugin/AuthorizationMetaStoreFilterHook.java | 23 +- .../plugin/HivePrivilegeObjectUtils.java | 125 ++++++++- .../plugin/metastore/HiveMetaStoreAuthorizer.java | 49 +--- .../metastore/TestHiveMetaStoreAuthorizer.java | 2 +- .../metastore/TestTablePermissionFilterAlgos.java | 282 +++++++++++++++++++++ 5 files changed, 425 insertions(+), 56 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/security/authorization/plugin/AuthorizationMetaStoreFilterHook.java b/ql/src/java/org/apache/hadoop/hive/ql/security/authorization/plugin/AuthorizationMetaStoreFilterHook.java index ad218af9a05..5723022f62b 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/security/authorization/plugin/AuthorizationMetaStoreFilterHook.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/security/authorization/plugin/AuthorizationMetaStoreFilterHook.java @@ -19,10 +19,6 @@ package org.apache.hadoop.hive.ql.security.authorization.plugin; import java.util.ArrayList; import java.util.List; -import java.util.Set; -import java.util.stream.Collectors; - -import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.common.classification.InterfaceAudience.Private; import org.apache.hadoop.hive.metastore.DefaultMetaStoreFilterHookImpl; @@ -31,6 +27,7 @@ import org.apache.hadoop.hive.metastore.api.PrincipalType; import org.apache.hadoop.hive.metastore.api.Table; import org.apache.hadoop.hive.metastore.api.TableMeta; import org.apache.hadoop.hive.ql.security.authorization.plugin.HivePrivilegeObject.HivePrivilegeObjectType; +import static org.apache.hadoop.hive.ql.security.authorization.plugin.HivePrivilegeObjectUtils.TablePrivilegeLookup; import org.apache.hadoop.hive.ql.session.SessionState; import org.slf4j.Logger; @@ -162,20 +159,18 @@ public class AuthorizationMetaStoreFilterHook extends DefaultMetaStoreFilterHook return objs; } - private ImmutablePair<String, String> tableMetaKey(String dbName, String tableName) { - return new ImmutablePair(dbName, tableName); - } - @Override public List<TableMeta> filterTableMetas(List<TableMeta> tableMetas) throws MetaException { List<HivePrivilegeObject> listObjs = tableMetasToPrivilegeObjs(tableMetas); List<HivePrivilegeObject> filteredList = getFilteredObjects(listObjs); - Set<ImmutablePair<String, String>> filteredNames = filteredList.stream() - .map(e -> tableMetaKey(e.getDbname(), e.getObjectName())) - .collect(Collectors.toSet()); - return tableMetas.stream() - .filter(e -> filteredNames.contains(tableMetaKey(e.getDbName(), e.getTableName()))) - .collect(Collectors.toList()); + final List<TableMeta> ret = new ArrayList<>(); + final TablePrivilegeLookup index = new TablePrivilegeLookup(filteredList); + for(TableMeta table : tableMetas) { + if (index.lookup(table.getDbName(), table.getTableName()) != null) { + ret.add(table); + } + } + return ret; } @Override diff --git a/ql/src/java/org/apache/hadoop/hive/ql/security/authorization/plugin/HivePrivilegeObjectUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/security/authorization/plugin/HivePrivilegeObjectUtils.java index 4ab7870e7b3..479dce09ab7 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/security/authorization/plugin/HivePrivilegeObjectUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/security/authorization/plugin/HivePrivilegeObjectUtils.java @@ -19,8 +19,6 @@ package org.apache.hadoop.hive.ql.security.authorization.plugin; import java.util.ArrayList; import java.util.Arrays; -import java.util.Collection; -import java.util.Iterator; import java.util.List; import org.apache.hadoop.classification.InterfaceStability.Evolving; @@ -59,7 +57,130 @@ public class HivePrivilegeObjectUtils { objs.add(new HivePrivilegeObject(HivePrivilegeObjectType.DATACONNECTOR, null, dcname)); } return objs; + } + + /** + * A helper enabling efficient lookup of a table in a list by database name, table name. + * <p>When filtering a source list of tables by checking their presence in another permission list + * using the database name, table name, we need to avoid performing a cartesian product. This + * product stems from checking each table from the source (n tables) by comparing it to each + * table in the permission list (m permissions), thus an n * m complexity.</p> + * <p>This class reduces the complexity of the lookup in the permission by sorting them and + * using a binary search; the sort cost is m*log(m) and each lookup is log(m), the overall + * complexity of a check for all tables is in the order of m*log(m) + n*log(m) = (m + n)*log(n), + * close to m * log(n).</p> + * + */ + public abstract static class TableLookup<T> { + /** The container. */ + private final T[] index; + + /** + * @param table the table + * @return the database name of the table + */ + protected abstract String getDbName(T table); + + /** + * @param table the table + * @return the table name of the table + */ + protected abstract String getTableName(T table); + + /** + * Compares a table to another by names. + * @param table the table + * @param arg the argument table + * @return < 0, 0, > 0 + */ + private int compareNames(final T table, final T arg) { + int cmp = getDbName(table).compareTo(getDbName(arg)); + if (cmp == 0) { + String argTableName = getTableName(arg); + if (argTableName != null) { + cmp = getTableName(table).compareTo(argTableName); + } + } + return cmp; + } + + /** + * Compares a table to names. + * @param table the table + * @param dbName the argument database name + * @param dbName the argument table name + * @return < 0, 0, > 0 + */ + private int compareNames(final T table, final String dbName, final String tableName) { + int cmp = getDbName(table).compareTo(dbName); + if (cmp == 0 && tableName != null) { + cmp = getTableName(table).compareTo(tableName); + } + return cmp; + } + + /** + * Creates a ByName + * @param tables the tables to consider as a clique + */ + protected TableLookup(List<T> tables) { + if (tables != null) { + index = tables.toArray((T[]) new Object[0]); + // sort them by names + Arrays.sort(index, this::compareNames); + } else { + index = (T[]) new Object[0]; + } + } + /** + * Lookup using dichotomy using order described by database name, table name. + * @param dbName the database name + * @param tableName the table name + * @return the table if found in the index, null otherwise + */ + public final T lookup(final String dbName, final String tableName) { + int low = 0; + int high = index.length - 1; + while (low <= high) { + int mid = (low + high) >>> 1; + T item = index[mid]; + int cmp = compareNames(item, dbName, tableName); + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + return item; // key found + } + } + return null; // key not found. + } + + /** + * Checks whether a given table is present in this set. + * @param tt the table to check + * @return true if the set contains an item having the same database and table name + */ + public final boolean contains(T tt) { + return lookup(getDbName(tt), getTableName(tt)) != null; + } } + /** + * Specialized lookup for table privilege (HivePrivilegeObject).. + */ + public static class TablePrivilegeLookup extends TableLookup<HivePrivilegeObject> { + public TablePrivilegeLookup(List<HivePrivilegeObject> tables) { + super(tables); + } + + @Override protected String getDbName(HivePrivilegeObject o) { + return o.getDbname(); + } + + @Override protected String getTableName(HivePrivilegeObject o) { + return o.getObjectName(); + } + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/security/authorization/plugin/metastore/HiveMetaStoreAuthorizer.java b/ql/src/java/org/apache/hadoop/hive/ql/security/authorization/plugin/metastore/HiveMetaStoreAuthorizer.java index 55a980bc092..7f4cda8d922 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/security/authorization/plugin/metastore/HiveMetaStoreAuthorizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/security/authorization/plugin/metastore/HiveMetaStoreAuthorizer.java @@ -42,6 +42,7 @@ import org.apache.hadoop.hive.metastore.api.TableMeta; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.metadata.HiveUtils; import org.apache.hadoop.hive.ql.security.HiveMetastoreAuthenticationProvider; +import static org.apache.hadoop.hive.ql.security.authorization.plugin.HivePrivilegeObjectUtils.TablePrivilegeLookup; import org.apache.hadoop.hive.ql.security.authorization.plugin.metastore.events.*; import org.apache.hadoop.hive.ql.security.authorization.plugin.HiveAuthorizer; import org.apache.hadoop.hive.ql.security.authorization.plugin.HiveAuthorizerFactory; @@ -351,30 +352,16 @@ public class HiveMetaStoreAuthorizer extends MetaStorePreEventListener implement } private List<Table> getFilteredTableList(List<HivePrivilegeObject> hivePrivilegeObjects, List<Table> tableList) { - List<Table> ret = new ArrayList<>(); - for (HivePrivilegeObject hivePrivilegeObject : hivePrivilegeObjects) { - String dbName = hivePrivilegeObject.getDbname(); - String tblName = hivePrivilegeObject.getObjectName(); - Table table = getFilteredTable(dbName, tblName, tableList); - if (table != null) { + final List<Table> ret = new ArrayList<>(); + final TablePrivilegeLookup index = new TablePrivilegeLookup(hivePrivilegeObjects); + for(Table table : tableList) { + if (index.lookup(table.getDbName(), table.getTableName()) != null) { ret.add(table); } } return ret; } - private Table getFilteredTable(String dbName, String tblName, List<Table> tableList) { - Table ret = null; - for (Table table: tableList) { - String databaseName = table.getDbName(); - String tableName = table.getTableName(); - if (dbName.equals(databaseName) && tblName.equals(tableName)) { - ret = table; - break; - } - } - return ret; - } private List<String> filterTableNames(HiveMetaStoreAuthzInfo hiveMetaStoreAuthzInfo, String dbName, List<String> tableNames) throws MetaException { @@ -396,32 +383,16 @@ public class HiveMetaStoreAuthorizer extends MetaStorePreEventListener implement } return ret; } - - private List<String> getFilteredTableNames(List<HivePrivilegeObject> hivePrivilegeObjects, String databaseName, - List<String> tableNames) { + private List<String> getFilteredTableNames(List<HivePrivilegeObject> hivePrivilegeObjects, String databaseName, List<String> tableNames) { List<String> ret = new ArrayList<>(); - for (HivePrivilegeObject hivePrivilegeObject : hivePrivilegeObjects) { - String dbName = hivePrivilegeObject.getDbname(); - String tblName = hivePrivilegeObject.getObjectName(); - String table = getFilteredTableNames(dbName, tblName, databaseName, tableNames); - if (table != null) { - ret.add(table); - } - } - return ret; - } - - private String getFilteredTableNames(String dbName, String tblName, String databaseName, List<String> tableNames) { - String ret = null; - for (String tableName : tableNames) { - if (dbName.equals(databaseName) && tblName.equals(tableName)) { - ret = tableName; - break; + final TablePrivilegeLookup index = new TablePrivilegeLookup(hivePrivilegeObjects); + for(String tableName : tableNames) { + if (index.lookup(databaseName, tableName) != null) { + ret.add(tableName); } } return ret; } - private String getDBName(String str) { return (str != null) ? str.substring(str.indexOf("#")+1) : null; } diff --git a/ql/src/test/org/apache/hadoop/hive/ql/security/authorization/plugin/metastore/TestHiveMetaStoreAuthorizer.java b/ql/src/test/org/apache/hadoop/hive/ql/security/authorization/plugin/metastore/TestHiveMetaStoreAuthorizer.java index bb191ba8605..ec9a93c0837 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/security/authorization/plugin/metastore/TestHiveMetaStoreAuthorizer.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/security/authorization/plugin/metastore/TestHiveMetaStoreAuthorizer.java @@ -36,8 +36,8 @@ import org.junit.FixMethodOrder; import org.junit.runners.MethodSorters; import org.junit.Before; import org.junit.Test; -import java.util.Map; +import java.util.Map; import java.io.File; import static org.junit.Assert.assertEquals; diff --git a/ql/src/test/org/apache/hadoop/hive/ql/security/authorization/plugin/metastore/TestTablePermissionFilterAlgos.java b/ql/src/test/org/apache/hadoop/hive/ql/security/authorization/plugin/metastore/TestTablePermissionFilterAlgos.java new file mode 100644 index 00000000000..0f0a2ff996e --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/security/authorization/plugin/metastore/TestTablePermissionFilterAlgos.java @@ -0,0 +1,282 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.security.authorization.plugin.metastore; + +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.apache.hadoop.hive.ql.security.authorization.plugin.HivePrivilegeObjectUtils; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.junit.Assert; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Random; +import java.util.Set; +import java.util.stream.Collectors; + +/** + * This test tries different methods of lookup to try and determine the most efficient in + * the particular usage case related to checking tables against permissions. + * Note that a meaningful performance test requires at least 20 dbs, 10000 tables per db and 40 loops + * on a 2022 laptop. + */ +public class TestTablePermissionFilterAlgos { + private static final Logger LOGGER = LogManager.getLogger(TestHiveMetaStoreAuthorizer.class); + // reduce constants to speed up tests when true, set to false for benchmarking + private static final boolean TEST = true; + // Whether we shuffle table names or not: this has an important effect on sort speed - mostly sorted + // arrays sort faster than completely unsorted ones. + // A hashset of concatenated strings seem to be faster than a sort in that case. + // I suspect something fishy in the cost of string creation (intern()?) since a hashset of pairs is much slower. + // We'll make the semi-educated assumption that most of the time, tablenames come back mostly ordered. + private static final boolean SHUFFLE_TABLENAMES = !TEST; + private static final int NUM_DB = TEST? 2 : 20; + private static final int NUM_TBL = TEST? 100 : 10000; + private static final int NLOOPS = TEST? 1 : 40; + + private final List<TestTable> tables = createTables(NUM_DB, NUM_TBL); + // lookup some + private final List<TestTable> filtered = tables.stream() + .filter(t -> t.getTableName().endsWith("0")) + .collect(Collectors.toList()); + + /** + * Fake tables to test. + */ + private static class TestTable { + private int hashCode = -1; + private final String dbName; + private final String tableName; + + private TestTable(String dbName, String tableName) { + this.dbName = dbName; + this.tableName = tableName; + } + + @Override + public String toString() { + return tableFullName(this); + } + + public String getDbName() { + return dbName; + } + + public String getTableName() { + return tableName; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o instanceof TestTable) { + TestTable tt = (TestTable) o; + return dbName.equals(tt.dbName) && tableName.equals(tt.tableName); + } + return false; + } + + @Override + public int hashCode() { + int h = hashCode; + if (h < 0) { + hashCode = h = Math.abs((31 * dbName.hashCode()) + tableName.hashCode()); + } + return h; + } + } + + /** + * Creates a list of tables. + * @param dbs number of databases + * @param tbls number of tables per database + * @return a list of tables + */ + private static List<TestTable> createTables(int dbs, int tbls) { + Random rnd = new Random(20230815L); + List<Integer> dbis = new ArrayList<>(dbs); + for(int i = 0; i < dbs; ++i) { + dbis.add(i); + } + Collections.shuffle(dbis, rnd); + List<Integer> tblis = new ArrayList<>(tbls); + for(int i = 0; i < tbls; ++i) { + tblis.add(i); + } + if (SHUFFLE_TABLENAMES) { + Collections.shuffle(tblis, rnd); + } + + List<TestTable> tables = new ArrayList<>(dbs * tbls); + for(int d : dbis) { + for(int t : tblis) { + String dbname = String.format("db%05x", d); + String tblName = String.format("tbl%05x", t); + tables.add(new TestTable(dbname, tblName)); + } + } + return tables; + } + + /** + * Allows timing code blocks execution precisely. + */ + static class Chrono { + long start; + long end; + + void start() { + start = System.currentTimeMillis(); + } + void stop() { + end = System.currentTimeMillis(); + } + String elapse() { + return String.format("%.3f", (double) (end - start) / 1000d); + } + } + + /** + * Specialized for TestTable purpose. + */ + private static class TestLookup extends HivePrivilegeObjectUtils.TableLookup<TestTable> { + protected TestLookup(List<TestTable> tables) { + super(tables); + } + + @Override protected String getDbName(TestTable o) { + return o.getDbName(); + } + + @Override protected String getTableName(TestTable o) { + return o.getTableName(); + } + }; + + @Test + public void testIndexedLookup() { + System.gc(); + Chrono chrono = new Chrono(); + chrono.start(); + int cnt = 0; + for(int l = 0; l < NLOOPS; ++l) { + TestLookup index = new TestLookup(tables); + List<TestTable> tts = l % 2 == 1 + ? tables + : filtered; + cnt = 0; + // lookup all + for (TestTable tt : tts) { + if (index.contains(tt)) { + cnt += 1; + } + } + Assert.assertEquals(cnt, tts.size()); + } + chrono.stop(); + LOGGER.info("lookup elapse " + chrono.elapse()); + } + + private static ImmutablePair<String,String> tableKey(TestTable tt) { + return new ImmutablePair<>(tt.getDbName(), tt.getTableName()); + } + + @Test + public void testKeyPairLookup() { + System.gc(); + Chrono chrono = new Chrono(); + chrono.start(); + for(int l = 0; l < NLOOPS; ++l) { + Set<ImmutablePair<String, String>> index = tables.stream() + .map(TestTablePermissionFilterAlgos::tableKey) + .collect(Collectors.toSet()); + List<TestTable> tts = l % 2 == 1 + ? tables + : filtered; + int cnt = 0; + // lookup all + for (TestTable tt : tts) { + if (index.contains(tableKey(tt))) { + cnt += 1; + } + } + Assert.assertEquals(cnt, tts.size()); + } + chrono.stop(); + LOGGER.info("set pair elapse " + chrono.elapse()); + } + + @Test + public void testTestTableLookup() { + System.gc(); + Chrono chrono = new Chrono(); + chrono.start(); + for(int l = 0; l < NLOOPS; ++l) { + Set<TestTable> index = new HashSet<>(tables); + List<TestTable> tts = l % 2 == 1 + ? tables + : filtered; + int cnt = 0; + // lookup all + for (TestTable tt : tts) { + if (index.contains(tt)) { + cnt += 1; + } + } + Assert.assertEquals(cnt, tts.size()); + } + chrono.stop(); + LOGGER.info("set table elapse " + chrono.elapse()); + } + + + private static String tableFullName(TestTable tt) { + return tt.getDbName() + "." + tt.getTableName(); + } + + @Test + public void testFullNameLookup() { + System.gc(); + Chrono chrono = new Chrono(); + chrono.start(); + for(int l = 0; l < NLOOPS; ++l) { + Set<String> index = tables.stream() + .map(TestTablePermissionFilterAlgos::tableFullName) + .collect(Collectors.toSet()); + List<TestTable> tts = l % 2 == 1 + ? tables + : filtered; + int cnt = 0; + // lookup all + for (TestTable tt : tts) { + if (index.contains(tableFullName(tt))) { + cnt += 1; + } + } + Assert.assertEquals(cnt, tts.size()); + } + chrono.stop(); + LOGGER.info("set fullname elapse " + chrono.elapse()); + } +}