Repository: hive Updated Branches: refs/heads/master 0a9fabbd0 -> 1253450e0
HIVE-15267 Make query length calculation logic more accurate in TxnUtils.needNewQuery() (Steve Yeom, reviewed by Eugene Koifman) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/1253450e Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/1253450e Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/1253450e Branch: refs/heads/master Commit: 1253450e061dd9ff6c868a0a1512bea9532a48bd Parents: 0a9fabb Author: Eugene Koifman <ekoif...@hortonworks.com> Authored: Fri Oct 13 08:44:13 2017 -0700 Committer: Eugene Koifman <ekoif...@hortonworks.com> Committed: Fri Oct 13 08:44:13 2017 -0700 ---------------------------------------------------------------------- .../hadoop/hive/metastore/txn/TxnUtils.java | 214 +++++++++++++------ .../hadoop/hive/metastore/txn/TestTxnUtils.java | 24 +-- 2 files changed, 157 insertions(+), 81 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/1253450e/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/txn/TxnUtils.java ---------------------------------------------------------------------- diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/txn/TxnUtils.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/txn/TxnUtils.java index 7579ae8..2eb967e 100644 --- a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/txn/TxnUtils.java +++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/txn/TxnUtils.java @@ -144,88 +144,150 @@ public class TxnUtils { } /** - * Build a query (or queries if one query is too big) with specified "prefix" and "suffix", - * while populating the IN list into multiple OR clauses, e.g. id in (1,2,3) OR id in (4,5,6) - * For NOT IN case, NOT IN list is broken into multiple AND clauses. - * @param queries array of complete query strings - * @param prefix part of the query that comes before IN list - * @param suffix part of the query that comes after IN list - * @param inList the list containing IN list values - * @param inColumn column name of IN list operator - * @param addParens add a pair of parenthesis outside the IN lists - * e.g. ( id in (1,2,3) OR id in (4,5,6) ) - * @param notIn clause to be broken up is NOT IN + * Build a query (or queries if one query is too big but only for the case of 'IN' + * composite clause. For the case of 'NOT IN' clauses, multiple queries change + * the semantics of the intended query. + * E.g., Let's assume that input "inList" parameter has [5, 6] and that + * _DIRECT_SQL_MAX_QUERY_LENGTH_ configuration parameter only allows one value in a 'NOT IN' clause, + * Then having two delete statements changes the semantics of the inteneded SQL statement. + * I.e. 'delete from T where a not in (5)' and 'delete from T where a not in (6)' sequence + * is not equal to 'delete from T where a not in (5, 6)'.) + * with one or multiple 'IN' or 'NOT IN' clauses with the given input parameters. + * + * Note that this method currently support only single column for + * IN/NOT IN clauses and that only covers OR-based composite 'IN' clause and + * AND-based composite 'NOT IN' clause. + * For example, for 'IN' clause case, the method will build a query with OR. + * E.g., "id in (1,2,3) OR id in (4,5,6)". + * For 'NOT IN' case, NOT IN list is broken into multiple 'NOT IN" clauses connected by AND. + * + * Note that, in this method, "a composite 'IN' clause" is defined as "a list of multiple 'IN' + * clauses in a query". + * + * @param queries OUT: Array of query strings + * @param prefix IN: Part of the query that comes before IN list + * @param suffix IN: Part of the query that comes after IN list + * @param inList IN: the list with IN list values + * @param inColumn IN: single column name of IN list operator + * @param addParens IN: add a pair of parenthesis outside the IN lists + * e.g. "(id in (1,2,3) OR id in (4,5,6))" + * @param notIn IN: is this for building a 'NOT IN' composite clause? */ - public static void buildQueryWithINClause(Configuration conf, List<String> queries, StringBuilder prefix, - StringBuilder suffix, List<Long> inList, - String inColumn, boolean addParens, boolean notIn) { - if (inList == null || inList.size() == 0) { + public static void buildQueryWithINClause(Configuration conf, + List<String> queries, + StringBuilder prefix, + StringBuilder suffix, + List<Long> inList, + String inColumn, + boolean addParens, + boolean notIn) { + // Get configuration parameters + int maxQueryLength = MetastoreConf.getIntVar(conf, ConfVars.DIRECT_SQL_MAX_QUERY_LENGTH); + int batchSize = MetastoreConf.getIntVar(conf, ConfVars.DIRECT_SQL_MAX_ELEMENTS_IN_CLAUSE); + + // Check parameter set validity as a public method. + if (inList == null || inList.size() == 0 || maxQueryLength <= 0 || batchSize <= 0) { throw new IllegalArgumentException("The IN list is empty!"); } - int batchSize = MetastoreConf.getIntVar(conf, ConfVars.DIRECT_SQL_MAX_ELEMENTS_IN_CLAUSE); - int numWholeBatches = inList.size() / batchSize; + + // Define constants and local variables. + int inListSize = inList.size(); StringBuilder buf = new StringBuilder(); - buf.append(prefix); - if (addParens) { - buf.append("("); - } - buf.append(inColumn); - if (notIn) { - buf.append(" not in ("); - } else { - buf.append(" in ("); - } - for (int i = 0; i <= numWholeBatches; i++) { - if (i * batchSize == inList.size()) { - // At this point we just realized we don't need another query - break; - } + int cursor4InListArray = 0, // cursor for the "inList" array. + cursor4InClauseElements = 0, // cursor for an element list per an 'IN'/'NOT IN'-clause. + cursor4queryOfInClauses = 0; // cursor for in-clause lists per a query. + boolean nextItemNeeded = true; + boolean newInclausePrefixJustAppended = false; + StringBuilder nextValue = new StringBuilder(""); + StringBuilder newInclausePrefix = + new StringBuilder(notIn ? " and " + inColumn + " not in (": + " or " + inColumn + " in ("); - if (needNewQuery(conf, buf)) { - // Wrap up current query string + // Loop over the given inList elements. + while( cursor4InListArray < inListSize || !nextItemNeeded) { + if (cursor4queryOfInClauses == 0) { + // Append prefix + buf.append(prefix); if (addParens) { - buf.append(")"); + buf.append("("); } - buf.append(suffix); - queries.add(buf.toString()); + buf.append(inColumn); - // Prepare a new query string - buf.setLength(0); - } - - if (i > 0) { if (notIn) { - if (buf.length() == 0) { - buf.append(prefix); - if (addParens) { - buf.append("("); - } - } else { - buf.append(" and "); - } - buf.append(inColumn); buf.append(" not in ("); } else { - if (buf.length() == 0) { - buf.append(prefix); - if (addParens) { - buf.append("("); - } - } else { - buf.append(" or "); - } - buf.append(inColumn); buf.append(" in ("); } + cursor4queryOfInClauses++; + newInclausePrefixJustAppended = false; + } + + // Get the next "inList" value element if needed. + if (nextItemNeeded) { + nextValue.setLength(0); + nextValue.append(String.valueOf(inList.get(cursor4InListArray++))); + nextItemNeeded = false; } - for (int j = i * batchSize; j < (i + 1) * batchSize && j < inList.size(); j++) { - buf.append(inList.get(j)).append(","); + // Compute the size of a query when the 'nextValue' is added to the current query. + int querySize = querySizeExpected(buf.length(), nextValue.length(), suffix.length(), addParens); + + if (querySize > maxQueryLength * 1024) { + // Check an edge case where the DIRECT_SQL_MAX_QUERY_LENGTH does not allow one 'IN' clause with single value. + if (cursor4queryOfInClauses == 1 && cursor4InClauseElements == 0) { + throw new IllegalArgumentException("The current " + ConfVars.DIRECT_SQL_MAX_QUERY_LENGTH.varname + " is set too small to have one IN clause with single value!"); + } + + // Check en edge case to throw Exception if we can not build a single query for 'NOT IN' clause cases as mentioned at the method comments. + if (notIn) { + throw new IllegalArgumentException("The NOT IN list has too many elements for the current " + ConfVars.DIRECT_SQL_MAX_QUERY_LENGTH.varname + "!"); + } + + // Wrap up the current query string since we can not add another "inList" element value. + if (newInclausePrefixJustAppended) { + buf.delete(buf.length()-newInclausePrefix.length(), buf.length()); + } + + buf.setCharAt(buf.length() - 1, ')'); // replace the "commar" to finish a 'IN' clause string. + + if (addParens) { + buf.append(")"); + } + + buf.append(suffix); + queries.add(buf.toString()); + + // Prepare a new query string. + buf.setLength(0); + cursor4queryOfInClauses = cursor4InClauseElements = 0; + querySize = 0; + newInclausePrefixJustAppended = false; + continue; + } else if (cursor4InClauseElements >= batchSize-1 && cursor4InClauseElements != 0) { + // Finish the current 'IN'/'NOT IN' clause and start a new clause. + buf.setCharAt(buf.length() - 1, ')'); // replace the "commar". + buf.append(newInclausePrefix.toString()); + + newInclausePrefixJustAppended = true; + + // increment cursor for per-query IN-clause list + cursor4queryOfInClauses++; + cursor4InClauseElements = 0; + } else { + buf.append(nextValue.toString()).append(","); + nextItemNeeded = true; + newInclausePrefixJustAppended = false; + // increment cursor for elements per 'IN'/'NOT IN' clause. + cursor4InClauseElements++; } - buf.setCharAt(buf.length() - 1, ')'); } + // Finish the last query. + if (newInclausePrefixJustAppended) { + buf.delete(buf.length()-newInclausePrefix.length(), buf.length()); + } + buf.setCharAt(buf.length() - 1, ')'); // replace the commar. if (addParens) { buf.append(")"); } @@ -233,11 +295,25 @@ public class TxnUtils { queries.add(buf.toString()); } - /** Estimate if the size of a string will exceed certain limit */ - private static boolean needNewQuery(Configuration conf, StringBuilder sb) { - int queryMemoryLimit = MetastoreConf.getIntVar(conf, ConfVars.DIRECT_SQL_MAX_QUERY_LENGTH); - // http://www.javamex.com/tutorials/memory/string_memory_usage.shtml - long sizeInBytes = 8 * (((sb.length() * 2) + 45) / 8); - return sizeInBytes / 1024 > queryMemoryLimit; + /* + * Compute and return the size of a query statement with the given parameters as input variables. + * + * @param sizeSoFar size of the current contents of the buf + * @param sizeNextItem size of the next 'IN' clause element value. + * @param suffixSize size of the suffix for a quey statement + * @param addParens Do we add an additional parenthesis? + */ + private static int querySizeExpected(int sizeSoFar, + int sizeNextItem, + int suffixSize, + boolean addParens) { + + int size = sizeSoFar + sizeNextItem + suffixSize; + + if (addParens) { + size++; + } + + return size; } } http://git-wip-us.apache.org/repos/asf/hive/blob/1253450e/standalone-metastore/src/test/java/org/apache/hadoop/hive/metastore/txn/TestTxnUtils.java ---------------------------------------------------------------------- diff --git a/standalone-metastore/src/test/java/org/apache/hadoop/hive/metastore/txn/TestTxnUtils.java b/standalone-metastore/src/test/java/org/apache/hadoop/hive/metastore/txn/TestTxnUtils.java index 674084d..7dd268f 100644 --- a/standalone-metastore/src/test/java/org/apache/hadoop/hive/metastore/txn/TestTxnUtils.java +++ b/standalone-metastore/src/test/java/org/apache/hadoop/hive/metastore/txn/TestTxnUtils.java @@ -58,7 +58,7 @@ public class TestTxnUtils { MetastoreConf.setLongVar(conf, ConfVars.DIRECT_SQL_MAX_QUERY_LENGTH, 1); MetastoreConf.setLongVar(conf, ConfVars.DIRECT_SQL_MAX_ELEMENTS_IN_CLAUSE, 10); List<Long> inList = new ArrayList<>(); - for (long i = 1; i <= 200; i++) { + for (long i = 1; i <= 189; i++) { inList.add(i); } TxnUtils.buildQueryWithINClause(conf, queries, prefix, suffix, inList, "TXN_ID", true, false); @@ -68,7 +68,7 @@ public class TestTxnUtils { // Case 2 - Max in list members: 10; Max query string length: 1KB // The first query has 2 full batches, and the second query only has 1 batch which only contains 1 member queries.clear(); - inList.add((long)201); + inList.add((long)190); TxnUtils.buildQueryWithINClause(conf, queries, prefix, suffix, inList, "TXN_ID", true, false); Assert.assertEquals(2, queries.size()); runAgainstDerby(queries); @@ -77,11 +77,11 @@ public class TestTxnUtils { MetastoreConf.setLongVar(conf, ConfVars.DIRECT_SQL_MAX_QUERY_LENGTH, 1); MetastoreConf.setLongVar(conf, ConfVars.DIRECT_SQL_MAX_ELEMENTS_IN_CLAUSE, 1000); queries.clear(); - for (long i = 202; i <= 1000; i++) { + for (long i = 191; i <= 1000; i++) { inList.add(i); } TxnUtils.buildQueryWithINClause(conf, queries, prefix, suffix, inList, "TXN_ID", true, false); - Assert.assertEquals(1, queries.size()); + Assert.assertEquals(5, queries.size()); runAgainstDerby(queries); // Case 3.2 - Max in list members: 1000, Max query string length: 10KB, and exact 1000 members in a single IN clause @@ -99,7 +99,7 @@ public class TestTxnUtils { MetastoreConf.setLongVar(conf, ConfVars.DIRECT_SQL_MAX_QUERY_LENGTH, 1); queries.clear(); TxnUtils.buildQueryWithINClause(conf, queries, prefix, suffix, inList, "TXN_ID", true, false); - Assert.assertEquals(2, queries.size()); + Assert.assertEquals(10, queries.size()); runAgainstDerby(queries); MetastoreConf.setLongVar(conf, ConfVars.DIRECT_SQL_MAX_QUERY_LENGTH, 10); queries.clear(); @@ -107,7 +107,13 @@ public class TestTxnUtils { Assert.assertEquals(1, queries.size()); runAgainstDerby(queries); - // Case 4 - Max in list members: 1000; Max query string length: 10KB + // Case 4 - NOT IN list + queries.clear(); + TxnUtils.buildQueryWithINClause(conf, queries, prefix, suffix, inList, "TXN_ID", true, true); + Assert.assertEquals(1, queries.size()); + runAgainstDerby(queries); + + // Case 5 - Max in list members: 1000; Max query string length: 10KB queries.clear(); for (long i = 2001; i <= 4321; i++) { inList.add(i); @@ -116,12 +122,6 @@ public class TestTxnUtils { Assert.assertEquals(3, queries.size()); runAgainstDerby(queries); - // Case 5 - NOT IN list - queries.clear(); - TxnUtils.buildQueryWithINClause(conf, queries, prefix, suffix, inList, "TXN_ID", true, true); - Assert.assertEquals(3, queries.size()); - runAgainstDerby(queries); - // Case 6 - No parenthesis queries.clear(); suffix.setLength(0);